diff --git a/cddl/lib/libzfs/Makefile b/cddl/lib/libzfs/Makefile index 6f8965e4c14a..ee57c30cc6ca 100644 --- a/cddl/lib/libzfs/Makefile +++ b/cddl/lib/libzfs/Makefile @@ -1,107 +1,107 @@ .PATH: ${SRCTOP}/sys/contrib/openzfs/module/icp .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zcommon .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzfs/os/freebsd .PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libshare .PATH: ${SRCTOP}/sys/contrib/openzfs/include .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd .PATH: ${SRCTOP}/sys/contrib/openzfs/module/zstd/lib PACKAGE= zfs LIB= zfs LIBADD= \ avl \ bsdxml \ crypto \ geom \ m \ md \ nvpair \ pthread \ rt \ umem \ util \ uutil \ z \ zfs_core \ zutil INCS= libzfs.h USER_C = \ libzfs_changelist.c \ libzfs_config.c \ libzfs_crypto.c \ libzfs_dataset.c \ libzfs_diff.c \ libzfs_import.c \ libzfs_iter.c \ libzfs_mount.c \ libzfs_pool.c \ libzfs_sendrecv.c \ libzfs_status.c \ libzfs_util.c # FreeBSD USER_C += \ libzfs_compat.c \ libzfs_zmount.c # libshare USER_C += \ libshare.c \ nfs.c \ os/freebsd/nfs.c \ os/freebsd/smb.c KERNEL_C = \ cityhash.c \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c ARCH_C = .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" ARCH_C += zfs_fletcher_intel.c \ zfs_fletcher_sse.c CFLAGS += -DHAVE_SSE2 .endif .if ${MACHINE_ARCH} == "amd64" ARCH_C += zfs_fletcher_avx512.c CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F .endif .if ${MACHINE_CPUARCH} == "aarch64" ARCH_C += zfs_fletcher_aarch64_neon.c .endif SRCS= $(USER_C) $(KERNEL_C) $(ARCH_C) WARNS?= 2 SHLIB_MAJOR= 4 CSTD= c99 CFLAGS+= -DIN_BASE CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libshare CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include CFLAGS+= -I${SRCTOP}/sys CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/module/icp/include CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h CFLAGS+= -DHAVE_ISSETUGID CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h CFLAGS+= -DSYSCONFDIR=\"/etc\" CFLAGS+= -DPKGDATADIR=\"/usr/share/zfs\" - +CFLAGS+= -DZFSEXECDIR=\"${LIBEXECDIR}/zfs\" .include diff --git a/cddl/usr.libexec/Makefile b/cddl/usr.libexec/Makefile index 692f16d33717..9eb4872b69d5 100644 --- a/cddl/usr.libexec/Makefile +++ b/cddl/usr.libexec/Makefile @@ -1,9 +1,10 @@ .include SUBDIR.${MK_ZFS}+= \ + zfs_prepare_disk \ zpool_influxdb SUBDIR_PARALLEL= .include diff --git a/cddl/usr.libexec/zfs_prepare_disk/Makefile b/cddl/usr.libexec/zfs_prepare_disk/Makefile new file mode 100644 index 000000000000..0efb4452829d --- /dev/null +++ b/cddl/usr.libexec/zfs_prepare_disk/Makefile @@ -0,0 +1,19 @@ +ZFSTOP= ${SRCTOP}/sys/contrib/openzfs + +.PATH: ${ZFSTOP}/scripts + +PACKAGE= zfs +SCRIPTS= zfs_prepare_disk +SCRIPTSDIR= ${LIBEXECDIR}/zfs + +.PATH: ${ZFSTOP}/man/man8 +MAN= zfs_prepare_disk.8 + +CLEANFILES+= zfs_prepare_disk.8 + +zfs_prepare_disk.8: zfs_prepare_disk.8.in + sed ${MAN_SUB} ${.ALLSRC} >${.TARGET} + +MAN_SUB+= -e 's|@zfsexecdir@|${LIBEXECDIR}/zfs|g' + +.include diff --git a/sys/contrib/openzfs/.gitignore b/sys/contrib/openzfs/.gitignore index 47d17ae16d34..a2cb92dd5406 100644 --- a/sys/contrib/openzfs/.gitignore +++ b/sys/contrib/openzfs/.gitignore @@ -1,89 +1,90 @@ # # This is the top-level .gitignore file: # ignore everything except a list of allowed files. # # This is not the place for entries that are specific to # a subdirectory. Instead add those files to the # .gitignore file in that subdirectory. # # N.B. # Please use 'git ls-files -i --exclude-standard' # command after changing this file, to see if there are # any tracked files which get ignored after the change. * !.github !cmd !config !contrib !etc !include !lib !man !module !rpm !scripts !tests !udev !.github/** !cmd/** !config/** !contrib/** !etc/** !include/** !lib/** !man/** !module/** !rpm/** !scripts/** !tests/** !udev/** !.editorconfig !.cirrus.yml !.gitignore !.gitmodules !.mailmap !AUTHORS !autogen.sh !CODE_OF_CONDUCT.md !configure.ac !copy-builtin !COPYRIGHT !LICENSE !Makefile.am !META !NEWS !NOTICE !README.md !RELEASES.md !TEST !zfs.release.in # # Normal rules # *.[oa] *.o.ur-safe *.lo *.la *.mod.c *~ *.swp *.gcno *.gcda *.pyc *.pyo .deps .libs .dirstamp .DS_Store modules.order Makefile Makefile.in +changelog *.patch *.orig *.tmp *.log diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 0d7df10d47db..5868838a26df 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.0 +Version: 2.2.1 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.5 +Linux-Maximum: 6.6 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary index 426e0207052d..9c69ec4f8ccc 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -1,1034 +1,1034 @@ #!/usr/bin/env python3 # # Copyright (c) 2008 Ben Rockwood , # Copyright (c) 2010 Martin Matuska , # Copyright (c) 2010-2011 Jason J. Hellenthal , # Copyright (c) 2017 Scot W. Stevenson # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. """Print statistics on the ZFS ARC Cache and other information Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. The original introduction to arc_summary can be found at http://cuddletech.com/?p=454 """ import argparse import os import subprocess import sys import time import errno # We can't use env -S portably, and we need python3 -u to handle pipes in # the shell abruptly closing the way we want to, so... import io if isinstance(sys.__stderr__.buffer, io.BufferedWriter): os.execv(sys.executable, [sys.executable, "-u"] + sys.argv) DESCRIPTION = 'Print ARC and other statistics for OpenZFS' INDENT = ' '*8 LINE_LENGTH = 72 DATE_FORMAT = '%a %b %d %H:%M:%S %Y' TITLE = 'ZFS Subsystem Report' SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' # Tunables and SPL are handled separately because they come from # different sources SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats 'zfetch': 'zfetchstats', 'zil': 'zil'} parser = argparse.ArgumentParser(description=DESCRIPTION) parser.add_argument('-a', '--alternate', action='store_true', default=False, help='use alternate formatting for tunables and SPL', dest='alt') parser.add_argument('-d', '--description', action='store_true', default=False, help='print descriptions with tunables and SPL', dest='desc') parser.add_argument('-g', '--graph', action='store_true', default=False, help='print graph on ARC use and exit', dest='graph') parser.add_argument('-p', '--page', type=int, dest='page', help='print page by number (DEPRECATED, use "-s")') parser.add_argument('-r', '--raw', action='store_true', default=False, help='dump all available data with minimal formatting', dest='raw') parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) ARGS = parser.parse_args() if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE def namefmt(ctl, base='vfs.zfs.'): # base is removed from the name cut = len(base) return ctl.name[cut:] def load_kstats(section): base = 'kstat.zfs.misc.{section}.'.format(section=section) fmt = lambda kstat: '{name} : {value}'.format(name=namefmt(kstat, base), value=kstat.value) kstats = sysctl.filter(base) return [fmt(kstat) for kstat in kstats if is_value(kstat)] def get_params(base): ctls = sysctl.filter(base) return {namefmt(ctl): str(ctl.value) for ctl in ctls if is_value(ctl)} def get_tunable_params(): return get_params('vfs.zfs') def get_vdev_params(): return get_params('vfs.zfs.vdev') def get_version_impl(request): # FreeBSD reports versions for zpl and spa instead of zfs and spl. name = {'zfs': 'zpl', 'spl': 'spa'}[request] mib = 'vfs.zfs.version.{}'.format(name) version = sysctl.filter(mib)[0].value return '{} version {}'.format(name, version) def get_descriptions(_request): ctls = sysctl.filter('vfs.zfs') return {namefmt(ctl): ctl.description for ctl in ctls if is_value(ctl)} elif sys.platform.startswith('linux'): KSTAT_PATH = '/proc/spl/kstat/zfs' SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: return list(f)[2:] # Get rid of header def get_params(basepath): """Collect information on the Solaris Porting Layer (SPL) or the tunables, depending on the PATH given. Does not check if PATH is legal. """ result = {} for name in os.listdir(basepath): path = os.path.join(basepath, name) with open(path) as f: value = f.read() result[name] = value.strip() return result def get_spl_params(): return get_params(SPL_PATH) def get_tunable_params(): return get_params(TUNABLES_PATH) def get_vdev_params(): return get_params(TUNABLES_PATH) def get_version_impl(request): # The original arc_summary called /sbin/modinfo/{spl,zfs} to get # the version information. We switch to /sys/module/{spl,zfs}/version # to make sure we get what is really loaded in the kernel try: with open("/sys/module/{}/version".format(request)) as f: return f.read().strip() except: return "(unknown)" def get_descriptions(request): """Get the descriptions of the Solaris Porting Layer (SPL) or the tunables, return with minimal formatting. """ if request not in ('spl', 'zfs'): print('ERROR: description of "{0}" requested)'.format(request)) sys.exit(1) descs = {} target_prefix = 'parm:' # We would prefer to do this with /sys/modules -- see the discussion at # get_version() -- but there isn't a way to get the descriptions from # there, so we fall back on modinfo command = ["/sbin/modinfo", request, "-0"] info = '' try: info = subprocess.run(command, stdout=subprocess.PIPE, check=True, universal_newlines=True) raw_output = info.stdout.split('\0') except subprocess.CalledProcessError: print("Error: Descriptions not available", "(can't access kernel module)") sys.exit(1) for line in raw_output: if not line.startswith(target_prefix): continue line = line[len(target_prefix):].strip() name, raw_desc = line.split(':', 1) desc = raw_desc.rsplit('(', 1)[0] if desc == '': desc = '(No description found)' descs[name.strip()] = desc.strip() return descs def handle_unraisableException(exc_type, exc_value=None, exc_traceback=None, err_msg=None, object=None): handle_Exception(exc_type, object, exc_traceback) def handle_Exception(ex_cls, ex, tb): if ex_cls is KeyboardInterrupt: sys.exit() if ex_cls is BrokenPipeError: # It turns out that while sys.exit() triggers an exception # not handled message on Python 3.8+, os._exit() does not. os._exit(0) if ex_cls is OSError: if ex.errno == errno.ENOTCONN: sys.exit() raise ex if hasattr(sys,'unraisablehook'): # Python 3.8+ sys.unraisablehook = handle_unraisableException sys.excepthook = handle_Exception def cleanup_line(single_line): """Format a raw line of data from /proc and isolate the name value part, returning a tuple with each. Currently, this gets rid of the middle '4'. For example "arc_no_grow 4 0" returns the tuple ("arc_no_grow", "0"). """ name, _, value = single_line.split() return name, value def draw_graph(kstats_dict): """Draw a primitive graph representing the basic information on the ARC -- its size and the proportion used by MFU and MRU -- and quit. We use max size of the ARC to calculate how full it is. This is a very rough representation. """ arc_stats = isolate_section('arcstats', kstats_dict) GRAPH_INDENT = ' '*4 GRAPH_WIDTH = 60 arc_size = f_bytes(arc_stats['size']) arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) mfu_size = f_bytes(arc_stats['mfu_size']) mru_size = f_bytes(arc_stats['mru_size']) meta_size = f_bytes(arc_stats['arc_meta_used']) dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) dnode_size = f_bytes(arc_stats['dnode_size']) info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ' 'DNODE {5} ({6})') info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, meta_size, dnode_size, dnode_limit) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+' mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max'])) mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max'])) arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max'])) total_ticks = float(arc_perc)*GRAPH_WIDTH mfu_ticks = mfu_perc*GRAPH_WIDTH mru_ticks = mru_perc*GRAPH_WIDTH other_ticks = total_ticks-(mfu_ticks+mru_ticks) core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks) core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form))) core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|' for line in ('', info_line, graph_line, core_line, graph_line, ''): print(line) def f_bytes(byte_string): """Return human-readable representation of a byte value in powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal points. Values smaller than one KiB are returned without decimal points. Note "bytes" is a reserved keyword. """ prefixes = ([2**80, "YiB"], # yobibytes (yotta) [2**70, "ZiB"], # zebibytes (zetta) [2**60, "EiB"], # exbibytes (exa) [2**50, "PiB"], # pebibytes (peta) [2**40, "TiB"], # tebibytes (tera) [2**30, "GiB"], # gibibytes (giga) [2**20, "MiB"], # mebibytes (mega) [2**10, "KiB"]) # kibibytes (kilo) bites = int(byte_string) if bites >= 2**10: for limit, unit in prefixes: if bites >= limit: value = bites / limit break result = '{0:.1f} {1}'.format(value, unit) else: result = '{0} Bytes'.format(bites) return result def f_hits(hits_string): """Create a human-readable representation of the number of hits. The single-letter symbols used are SI to avoid the confusion caused by the different "short scale" and "long scale" representations in English, which use the same words for different values. See https://en.wikipedia.org/wiki/Names_of_large_numbers and: https://physics.nist.gov/cuu/Units/prefixes.html """ numbers = ([10**24, 'Y'], # yotta (septillion) [10**21, 'Z'], # zetta (sextillion) [10**18, 'E'], # exa (quintrillion) [10**15, 'P'], # peta (quadrillion) [10**12, 'T'], # tera (trillion) [10**9, 'G'], # giga (billion) [10**6, 'M'], # mega (million) [10**3, 'k']) # kilo (thousand) hits = int(hits_string) if hits >= 1000: for limit, symbol in numbers: if hits >= limit: value = hits/limit break result = "%0.1f%s" % (value, symbol) else: result = "%d" % hits return result def f_perc(value1, value2): """Calculate percentage and return in human-readable form. If rounding produces the result '0.0' though the first number is not zero, include a 'less-than' symbol to avoid confusion. Division by zero is handled by returning 'n/a'; no error is called. """ v1 = float(value1) v2 = float(value2) try: perc = 100 * v1/v2 except ZeroDivisionError: result = 'n/a' else: result = '{0:0.1f} %'.format(perc) if result == '0.0 %' and v1 > 0: result = '< 0.1 %' return result def format_raw_line(name, value): """For the --raw option for the tunable and SPL outputs, decide on the correct formatting based on the --alternate flag. """ if ARGS.alt: result = '{0}{1}={2}'.format(INDENT, name, value) else: # Right-align the value within the line length if it fits, # otherwise just separate it from the name by a single space. fit = LINE_LENGTH - len(INDENT) - len(name) overflow = len(value) + 1 w = max(fit, overflow) result = '{0}{1}{2:>{w}}'.format(INDENT, name, value, w=w) return result def get_kstats(): """Collect information on the ZFS subsystem. The step does not perform any further processing, giving us the option to only work on what is actually needed. The name "kstat" is a holdover from the Solaris utility of the same name. """ result = {} for section in SECTION_PATHS.values(): if section not in result: result[section] = load_kstats(section) return result def get_version(request): """Get the version number of ZFS or SPL on this machine for header. Returns an error string, but does not raise an error, if we can't get the ZFS/SPL version. """ if request not in ('spl', 'zfs'): error_msg = '(ERROR: "{0}" requested)'.format(request) return error_msg return get_version_impl(request) def print_header(): """Print the initial heading with date and time as well as info on the kernel and ZFS versions. This is not called for the graph. """ # datetime is now recommended over time but we keep the exact formatting # from the older version of arc_summary in case there are scripts # that expect it in this way daydate = time.strftime(DATE_FORMAT) spc_date = LINE_LENGTH-len(daydate) sys_version = os.uname() sys_msg = sys_version.sysname+' '+sys_version.release zfs = get_version('zfs') spc_zfs = LINE_LENGTH-len(zfs) machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')' spl = get_version('spl') spc_spl = LINE_LENGTH-len(spl) print('\n'+('-'*LINE_LENGTH)) print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date)) print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs)) print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl)) def print_raw(kstats_dict): """Print all available data from the system in a minimally sorted format. This can be used as a source to be piped through 'grep'. """ sections = sorted(kstats_dict.keys()) for section in sections: print('\n{0}:'.format(section.upper())) lines = sorted(kstats_dict[section]) for line in lines: name, value = cleanup_line(line) print(format_raw_line(name, value)) # Tunables and SPL must be handled separately because they come from a # different source and have descriptions the user might request print() section_spl() section_tunables() def isolate_section(section_name, kstats_dict): """From the complete information on all sections, retrieve only those for one section. """ try: section_data = kstats_dict[section_name] except KeyError: print('ERROR: Data on {0} not available'.format(section_data)) sys.exit(1) section_dict = dict(cleanup_line(l) for l in section_data) return section_dict # Formatted output helper functions def prt_1(text, value): """Print text and one value, no indent""" spc = ' '*(LINE_LENGTH-(len(text)+len(value))) print('{0}{spc}{1}'.format(text, value, spc=spc)) def prt_i1(text, value): """Print text and one value, with indent""" spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value))) print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc)) def prt_2(text, value1, value2): """Print text and two values, no indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2)) print('{0}{spc} {1}'.format(text, values, spc=spc)) def prt_i2(text, value1, value2): """Print text and two values, with indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2)) print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc)) # The section output concentrates on important parameters instead of # being exhaustive (that is what the --raw parameter is for) def section_arc(kstats_dict): """Give basic information on the ARC, MRU and MFU. This is the first and most used section. """ arc_stats = isolate_section('arcstats', kstats_dict) throttle = arc_stats['memory_throttle_count'] if throttle == '0': health = 'HEALTHY' else: health = 'THROTTLED' prt_1('ARC status:', health) prt_i1('Memory throttle count:', throttle) print() arc_size = arc_stats['size'] arc_target_size = arc_stats['c'] arc_max = arc_stats['c_max'] arc_min = arc_stats['c_min'] meta = arc_stats['meta'] pd = arc_stats['pd'] pm = arc_stats['pm'] anon_data = arc_stats['anon_data'] anon_metadata = arc_stats['anon_metadata'] mfu_data = arc_stats['mfu_data'] mfu_metadata = arc_stats['mfu_metadata'] mru_data = arc_stats['mru_data'] mru_metadata = arc_stats['mru_metadata'] mfug_data = arc_stats['mfu_ghost_data'] mfug_metadata = arc_stats['mfu_ghost_metadata'] mrug_data = arc_stats['mru_ghost_data'] mrug_metadata = arc_stats['mru_ghost_metadata'] unc_data = arc_stats['uncached_data'] unc_metadata = arc_stats['uncached_metadata'] bonus_size = arc_stats['bonus_size'] dnode_limit = arc_stats['arc_dnode_limit'] dnode_size = arc_stats['dnode_size'] dbuf_size = arc_stats['dbuf_size'] hdr_size = arc_stats['hdr_size'] l2_hdr_size = arc_stats['l2_hdr_size'] abd_chunk_waste_size = arc_stats['abd_chunk_waste_size'] target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) prt_2('ARC size (current):', f_perc(arc_size, arc_max), f_bytes(arc_size)) prt_i2('Target size (adaptive):', f_perc(arc_target_size, arc_max), f_bytes(arc_target_size)) prt_i2('Min size (hard limit):', f_perc(arc_min, arc_max), f_bytes(arc_min)) prt_i2('Max size (high water):', target_size_ratio, f_bytes(arc_max)) caches_size = int(anon_data)+int(anon_metadata)+\ int(mfu_data)+int(mfu_metadata)+int(mru_data)+int(mru_metadata)+\ int(unc_data)+int(unc_metadata) prt_i2('Anonymous data size:', f_perc(anon_data, caches_size), f_bytes(anon_data)) prt_i2('Anonymous metadata size:', f_perc(anon_metadata, caches_size), f_bytes(anon_metadata)) s = 4294967296 v = (s-int(pd))*(s-int(meta))/s prt_i2('MFU data target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MFU data size:', f_perc(mfu_data, caches_size), f_bytes(mfu_data)) prt_i1('MFU ghost data size:', f_bytes(mfug_data)) v = (s-int(pm))*int(meta)/s prt_i2('MFU metadata target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MFU metadata size:', f_perc(mfu_metadata, caches_size), f_bytes(mfu_metadata)) prt_i1('MFU ghost metadata size:', f_bytes(mfug_metadata)) v = int(pd)*(s-int(meta))/s prt_i2('MRU data target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MRU data size:', f_perc(mru_data, caches_size), f_bytes(mru_data)) prt_i1('MRU ghost data size:', f_bytes(mrug_data)) v = int(pm)*int(meta)/s prt_i2('MRU metadata target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MRU metadata size:', f_perc(mru_metadata, caches_size), f_bytes(mru_metadata)) prt_i1('MRU ghost metadata size:', f_bytes(mrug_metadata)) prt_i2('Uncached data size:', f_perc(unc_data, caches_size), f_bytes(unc_data)) prt_i2('Uncached metadata size:', f_perc(unc_metadata, caches_size), f_bytes(unc_metadata)) prt_i2('Bonus size:', f_perc(bonus_size, arc_size), f_bytes(bonus_size)) prt_i2('Dnode cache target:', f_perc(dnode_limit, arc_max), f_bytes(dnode_limit)) prt_i2('Dnode cache size:', f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) prt_i2('Dbuf size:', f_perc(dbuf_size, arc_size), f_bytes(dbuf_size)) prt_i2('Header size:', f_perc(hdr_size, arc_size), f_bytes(hdr_size)) prt_i2('L2 header size:', f_perc(l2_hdr_size, arc_size), f_bytes(l2_hdr_size)) prt_i2('ABD chunk waste size:', f_perc(abd_chunk_waste_size, arc_size), f_bytes(abd_chunk_waste_size)) print() print('ARC hash breakdown:') prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) prt_i2('Elements current:', f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), f_hits(arc_stats['hash_elements'])) prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) print() print('ARC misc:') prt_i1('Deleted:', f_hits(arc_stats['deleted'])) prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) prt_i1('Eviction skips due to L2 writes:', f_hits(arc_stats['evict_l2_skip'])) prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) prt_i2('L2 eligible MFU evictions:', f_perc(arc_stats['evict_l2_eligible_mfu'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mfu'])) prt_i2('L2 eligible MRU evictions:', f_perc(arc_stats['evict_l2_eligible_mru'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mru'])) prt_i1('L2 ineligible evictions:', f_bytes(arc_stats['evict_l2_ineligible'])) print() def section_archits(kstats_dict): """Print information on how the caches are accessed ("arc hits"). """ arc_stats = isolate_section('arcstats', kstats_dict) all_accesses = int(arc_stats['hits'])+int(arc_stats['iohits'])+\ int(arc_stats['misses']) prt_1('ARC total accesses:', f_hits(all_accesses)) ta_todo = (('Total hits:', arc_stats['hits']), ('Total I/O hits:', arc_stats['iohits']), ('Total misses:', arc_stats['misses'])) for title, value in ta_todo: prt_i2(title, f_perc(value, all_accesses), f_hits(value)) print() dd_total = int(arc_stats['demand_data_hits']) +\ int(arc_stats['demand_data_iohits']) +\ int(arc_stats['demand_data_misses']) prt_2('ARC demand data accesses:', f_perc(dd_total, all_accesses), f_hits(dd_total)) dd_todo = (('Demand data hits:', arc_stats['demand_data_hits']), ('Demand data I/O hits:', arc_stats['demand_data_iohits']), ('Demand data misses:', arc_stats['demand_data_misses'])) for title, value in dd_todo: prt_i2(title, f_perc(value, dd_total), f_hits(value)) print() dm_total = int(arc_stats['demand_metadata_hits']) +\ int(arc_stats['demand_metadata_iohits']) +\ int(arc_stats['demand_metadata_misses']) prt_2('ARC demand metadata accesses:', f_perc(dm_total, all_accesses), f_hits(dm_total)) dm_todo = (('Demand metadata hits:', arc_stats['demand_metadata_hits']), ('Demand metadata I/O hits:', arc_stats['demand_metadata_iohits']), ('Demand metadata misses:', arc_stats['demand_metadata_misses'])) for title, value in dm_todo: prt_i2(title, f_perc(value, dm_total), f_hits(value)) print() pd_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_iohits']) +\ int(arc_stats['prefetch_data_misses']) - prt_2('ARC prefetch metadata accesses:', f_perc(pd_total, all_accesses), + prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses), f_hits(pd_total)) pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']), ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']), ('Prefetch data misses:', arc_stats['prefetch_data_misses'])) for title, value in pd_todo: prt_i2(title, f_perc(value, pd_total), f_hits(value)) print() pm_total = int(arc_stats['prefetch_metadata_hits']) +\ int(arc_stats['prefetch_metadata_iohits']) +\ int(arc_stats['prefetch_metadata_misses']) prt_2('ARC prefetch metadata accesses:', f_perc(pm_total, all_accesses), f_hits(pm_total)) pm_todo = (('Prefetch metadata hits:', arc_stats['prefetch_metadata_hits']), ('Prefetch metadata I/O hits:', arc_stats['prefetch_metadata_iohits']), ('Prefetch metadata misses:', arc_stats['prefetch_metadata_misses'])) for title, value in pm_todo: prt_i2(title, f_perc(value, pm_total), f_hits(value)) print() all_prefetches = int(arc_stats['predictive_prefetch'])+\ int(arc_stats['prescient_prefetch']) prt_2('ARC predictive prefetches:', f_perc(arc_stats['predictive_prefetch'], all_prefetches), f_hits(arc_stats['predictive_prefetch'])) prt_i2('Demand hits after predictive:', f_perc(arc_stats['demand_hit_predictive_prefetch'], arc_stats['predictive_prefetch']), f_hits(arc_stats['demand_hit_predictive_prefetch'])) prt_i2('Demand I/O hits after predictive:', f_perc(arc_stats['demand_iohit_predictive_prefetch'], arc_stats['predictive_prefetch']), f_hits(arc_stats['demand_iohit_predictive_prefetch'])) never = int(arc_stats['predictive_prefetch']) -\ int(arc_stats['demand_hit_predictive_prefetch']) -\ int(arc_stats['demand_iohit_predictive_prefetch']) prt_i2('Never demanded after predictive:', f_perc(never, arc_stats['predictive_prefetch']), f_hits(never)) print() prt_2('ARC prescient prefetches:', f_perc(arc_stats['prescient_prefetch'], all_prefetches), f_hits(arc_stats['prescient_prefetch'])) prt_i2('Demand hits after prescient:', f_perc(arc_stats['demand_hit_prescient_prefetch'], arc_stats['prescient_prefetch']), f_hits(arc_stats['demand_hit_prescient_prefetch'])) prt_i2('Demand I/O hits after prescient:', f_perc(arc_stats['demand_iohit_prescient_prefetch'], arc_stats['prescient_prefetch']), f_hits(arc_stats['demand_iohit_prescient_prefetch'])) never = int(arc_stats['prescient_prefetch'])-\ int(arc_stats['demand_hit_prescient_prefetch'])-\ int(arc_stats['demand_iohit_prescient_prefetch']) prt_i2('Never demanded after prescient:', f_perc(never, arc_stats['prescient_prefetch']), f_hits(never)) print() print('ARC states hits of all accesses:') cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), ('Most recently used (MRU):', arc_stats['mru_hits']), ('Most frequently used (MFU) ghost:', arc_stats['mfu_ghost_hits']), ('Most recently used (MRU) ghost:', arc_stats['mru_ghost_hits']), ('Uncached:', arc_stats['uncached_hits'])) for title, value in cl_todo: prt_i2(title, f_perc(value, all_accesses), f_hits(value)) print() def section_dmu(kstats_dict): """Collect information on the DMU""" zfetch_stats = isolate_section('zfetchstats', kstats_dict) zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() def section_l2arc(kstats_dict): """Collect information on L2ARC device if present. If not, tell user that we're skipping the section. """ # The L2ARC statistics live in the same section as the normal ARC stuff arc_stats = isolate_section('arcstats', kstats_dict) if arc_stats['l2_size'] == '0': print('L2ARC not detected, skipping section\n') return l2_errors = int(arc_stats['l2_writes_error']) +\ int(arc_stats['l2_cksum_bad']) +\ int(arc_stats['l2_io_error']) l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) health = 'HEALTHY' if l2_errors > 0: health = 'DEGRADED' prt_1('L2ARC status:', health) l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), ('Read errors:', 'l2_io_error'), ('Write errors:', 'l2_writes_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) print() prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), f_bytes(arc_stats['l2_asize'])) prt_i2('Header size:', f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) prt_i2('MFU allocated size:', f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mfu_asize'])) prt_i2('MRU allocated size:', f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mru_asize'])) prt_i2('Prefetch allocated size:', f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_prefetch_asize'])) prt_i2('Data (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_data_asize'])) prt_i2('Metadata (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_metadata_asize'])) print() prt_1('L2ARC breakdown:', f_hits(l2_access_total)) prt_i2('Hit ratio:', f_perc(arc_stats['l2_hits'], l2_access_total), f_hits(arc_stats['l2_hits'])) prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) print() print('L2ARC I/O:') prt_i2('Reads:', f_bytes(arc_stats['l2_read_bytes']), f_hits(arc_stats['l2_hits'])) prt_i2('Writes:', f_bytes(arc_stats['l2_write_bytes']), f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) print() def section_spl(*_): """Print the SPL parameters, if requested with alternative format and/or descriptions. This does not use kstats. """ if sys.platform.startswith('freebsd'): # No SPL support in FreeBSD return spls = get_spl_params() keylist = sorted(spls.keys()) print('Solaris Porting Layer (SPL):') if ARGS.desc: descriptions = get_descriptions('spl') for key in keylist: value = spls[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_tunables(*_): """Print the tunables, if requested with alternative format and/or descriptions. This does not use kstasts. """ tunables = get_tunable_params() keylist = sorted(tunables.keys()) print('Tunables:') if ARGS.desc: descriptions = get_descriptions('zfs') for key in keylist: value = tunables[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h """ zil_stats = isolate_section('zil', kstats_dict) prt_1('ZIL committed transactions:', f_hits(zil_stats['zil_itx_count'])) prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count'])) prt_i1('Flushes to stable storage:', f_hits(zil_stats['zil_commit_writer_count'])) prt_i2('Transactions to SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']), f_hits(zil_stats['zil_itx_metaslab_slog_count'])) prt_i2('Transactions to non-SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']), f_hits(zil_stats['zil_itx_metaslab_normal_count'])) print() section_calls = {'arc': section_arc, 'archits': section_archits, 'dmu': section_dmu, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, 'zil': section_zil} def main(): """Run program. The options to draw a graph and to print all data raw are treated separately because they come with their own call. """ kstats = get_kstats() if ARGS.graph: draw_graph(kstats) sys.exit(0) print_header() if ARGS.raw: print_raw(kstats) elif ARGS.section: try: section_calls[ARGS.section](kstats) except KeyError: print('Error: Section "{0}" unknown'.format(ARGS.section)) sys.exit(1) elif ARGS.page: print('WARNING: Pages are deprecated, please use "--section"\n') pages_to_calls = {1: 'arc', 2: 'archits', 3: 'l2arc', 4: 'dmu', 5: 'vdev', 6: 'tunables'} try: call = pages_to_calls[ARGS.page] except KeyError: print('Error: Page "{0}" not supported'.format(ARGS.page)) sys.exit(1) else: section_calls[call](kstats) else: # If no parameters were given, we print all sections. We might want to # change the sequence by hand calls = sorted(section_calls.keys()) for section in calls: section_calls[section](kstats) sys.exit(0) if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index 2f040ff7582c..9636c99fc85f 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -1,1308 +1,1366 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, 2017, Intel Corporation. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2023, Klara Inc. */ /* * ZFS syseventd module. * * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c * * The purpose of this module is to identify when devices are added to the * system, and appropriately online or replace the affected vdevs. * * When a device is added to the system: * * 1. Search for any vdevs whose devid matches that of the newly added * device. * * 2. If no vdevs are found, then search for any vdevs whose udev path * matches that of the new device. * * 3. If no vdevs match by either method, then ignore the event. * * 4. Attempt to online the device with a flag to indicate that it should * be unspared when resilvering completes. If this succeeds, then the * same device was inserted and we should continue normally. * * 5. If the pool does not have the 'autoreplace' property set, attempt to * online the device again without the unspare flag, which will * generate a FMA fault. * * 6. If the pool has the 'autoreplace' property set, and the matching vdev * is a whole disk, then label the new disk and attempt a 'zpool * replace'. * * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK * event indicates that a device failed to open during pool load, but the * autoreplace property was set. In this case, we deferred the associated * FMA fault until our module had a chance to process the autoreplace logic. * If the device could not be replaced, then the second online attempt will * trigger the FMA fault that we skipped earlier. * * On Linux udev provides a disk insert for both the disk and the partition. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_agents.h" #include "../zed_log.h" #define DEV_BYID_PATH "/dev/disk/by-id/" #define DEV_BYPATH_PATH "/dev/disk/by-path/" #define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); libzfs_handle_t *g_zfshdl; list_t g_pool_list; /* list of unavailable pools at initialization */ list_t g_device_list; /* list of disks with asynchronous label request */ tpool_t *g_tpool; boolean_t g_enumeration_done; pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ typedef struct unavailpool { zpool_handle_t *uap_zhp; list_node_t uap_node; } unavailpool_t; typedef struct pendingdev { char pd_physpath[128]; list_node_t pd_node; } pendingdev_t; static int zfs_toplevel_state(zpool_handle_t *zhp) { nvlist_t *nvroot; vdev_stat_t *vs; unsigned int c; verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); return (vs->vs_state); } static int zfs_unavail_pool(zpool_handle_t *zhp, void *data) { zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { unavailpool_t *uap; uap = malloc(sizeof (unavailpool_t)); if (uap == NULL) { perror("malloc"); exit(EXIT_FAILURE); } uap->uap_zhp = zhp; list_insert_tail((list_t *)data, uap); } else { zpool_close(zhp); } return (0); } +/* + * Write an array of strings to the zed log + */ +static void lines_to_zed_log_msg(char **lines, int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + zed_log_msg(LOG_INFO, "%s", lines[i]); + } +} + /* * Two stage replace on Linux * since we get disk notifications * we can wait for partitioned disk slice to show up! * * First stage tags the disk, initiates async partitioning, and returns * Second stage finds the tag and proceeds to ZFS labeling/replace * * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach * * 1. physical match with no fs, no partition * tag it top, partition disk * * 2. physical match again, see partition and tag * */ /* * The device associated with the given vdev (either by devid or physical path) * has been added to the system. If 'isdisk' is set, then we only attempt a * replacement if it's a whole disk. This also implies that we should label the * disk first. * * First, we attempt to online the device (making sure to undo any spare * operation when finished). If this succeeds, then we're done. If it fails, * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, * but that the label was not what we expected. If the 'autoreplace' property * is enabled, then we relabel the disk (if specified), and attempt a 'zpool * replace'. If the online is successful, but the new state is something else * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of * race, and we should avoid attempting to relabel the disk. * * Also can arrive here from a ESC_ZFS_VDEV_CHECK event */ static void zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) { const char *path; vdev_state_t newstate; nvlist_t *nvroot, *newvd; pendingdev_t *device; uint64_t wholedisk = 0ULL; uint64_t offline = 0ULL, faulted = 0ULL; uint64_t guid = 0ULL; uint64_t is_spare = 0; const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; - char devpath[PATH_MAX]; + char pathbuf[PATH_MAX]; int ret; int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; boolean_t is_sd = B_FALSE; boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; + char **lines = NULL; + int lines_cnt = 0; + /* + * Get the persistent path, typically under the '/dev/disk/by-id' or + * '/dev/disk/by-vdev' directories. Note that this path can change + * when a vdev is replaced with a new disk. + */ if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; /* Skip healthy disks */ verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (vs->vs_state == VDEV_STATE_HEALTHY) { zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", __func__, path); return; } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_IS_SPARE, &is_spare); /* * Special case: * * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH * entry in their config. For example, on this force-faulted disk: * * children[0]: * type: 'disk' * id: 0 * guid: 14309659774640089719 * path: '/dev/disk/by-vdev/L28' * whole_disk: 0 * DTL: 654 * create_txg: 4 * com.delphix:vdev_zap_leaf: 1161 * faulted: 1 * aux_state: 'external' * children[1]: * type: 'disk' * id: 1 * guid: 16002508084177980912 * path: '/dev/disk/by-vdev/L29' * devid: 'dm-uuid-mpath-35000c500a61d68a3' * phys_path: 'L29' * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' * whole_disk: 0 * DTL: 1028 * create_txg: 4 * com.delphix:vdev_zap_leaf: 131 * * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name. */ if (physpath == NULL && path != NULL) { /* If path begins with "/dev/disk/by-vdev/" ... */ if (strncmp(path, DEV_BYVDEV_PATH, strlen(DEV_BYVDEV_PATH)) == 0) { /* Set physpath to the char after "/dev/disk/by-vdev" */ physpath = &path[strlen(DEV_BYVDEV_PATH)]; } } /* * We don't want to autoreplace offlined disks. However, we do want to * replace force-faulted disks (`zpool offline -f`). Force-faulted * disks have both offline=1 and faulted=1 in the nvlist. */ if (offline && !faulted) { zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace", __func__, path); return; } is_mpath_wholedisk = is_mpath_whole_disk(path); zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', " "(guid %llu)", zpool_get_name(zhp), path, physpath ? physpath : "NULL", wholedisk ? "is" : "not", is_mpath_wholedisk? "is" : "not", labeled ? "is" : "not", enc_sysfs_path, (long long unsigned int)guid); /* * The VDEV guid is preferred for identification (gets passed in path) */ if (guid != 0) { (void) snprintf(fullpath, sizeof (fullpath), "%llu", (long long unsigned int)guid); } else { /* * otherwise use path sans partition suffix for whole disks */ (void) strlcpy(fullpath, path, sizeof (fullpath)); if (wholedisk) { char *spath = zfs_strip_partition(fullpath); if (!spath) { zed_log_msg(LOG_INFO, "%s: Can't alloc", __func__); return; } (void) strlcpy(fullpath, spath, sizeof (fullpath)); free(spath); } } if (is_spare) online_flag |= ZFS_ONLINE_SPARE; /* * Attempt to online the device. */ if (zpool_vdev_online(zhp, fullpath, online_flag, &newstate) == 0 && (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) { zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev '%s' ('%s') is " "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ? "HEALTHY" : "DEGRADED"); return; } /* * vdev_id alias rule for using scsi_debug devices (FMA automated * testing) */ if (physpath != NULL && strcmp("scsidebug", physpath) == 0) is_sd = B_TRUE; /* * If the pool doesn't have the autoreplace property set, then use * vdev online to trigger a FMA fault by posting an ereport. */ if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) { (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " "not a blank disk for '%s' ('%s')", fullpath, physpath); return; } /* * Convert physical path into its current device node. Rawpath * needs to be /dev/disk/by-vdev for a scsi_debug device since * /dev/disk/by-path will not be present. */ (void) snprintf(rawpath, sizeof (rawpath), "%s%s", is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); - if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { + if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) { zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", rawpath, strerror(errno)); - (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, - &newstate); + int err = zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); - zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", - fullpath, libzfs_error_description(g_zfshdl)); + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) " + "err %d, new state %d", + fullpath, libzfs_error_description(g_zfshdl), err, + err ? (int)newstate : 0); return; } /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && (vs->vs_state != VDEV_STATE_REMOVED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " "a bad state (currently %llu)", vs->vs_state); return; } nvlist_lookup_string(vdev, "new_devid", &new_devid); if (is_mpath_wholedisk) { /* Don't label device mapper or multipath disks. */ + zed_log_msg(LOG_INFO, + " it's a multipath wholedisk, don't label"); + if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines, + &lines_cnt) != 0) { + zed_log_msg(LOG_INFO, + " zpool_prepare_disk: could not " + "prepare '%s' (%s)", fullpath, + libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); + return; + } } else if (!labeled) { /* * we're auto-replacing a raw disk, so label it first */ char *leafname; /* * If this is a request to label a whole disk, then attempt to * write out the label. Before we can label the disk, we need * to map the physical string that was matched on to the under * lying device node. * * If any part of this process fails, then do a force online * to trigger a ZFS fault for the device (and any hot spare * replacement). */ - leafname = strrchr(devpath, '/') + 1; + leafname = strrchr(pathbuf, '/') + 1; /* * If this is a request to label a whole disk, then attempt to * write out the label. */ - if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { - zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, + vdev, "autoreplace", &lines, &lines_cnt) != 0) { + zed_log_msg(LOG_WARNING, + " zpool_prepare_and_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); + if (lines_cnt > 0) { + zed_log_msg(LOG_INFO, + " zfs_prepare_disk output:"); + lines_to_zed_log_msg(lines, lines_cnt); + } + libzfs_free_str_array(lines, lines_cnt); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; } /* * The disk labeling is asynchronous on Linux. Just record * this label request and return as there will be another * disk add event for the partition after the labeling is * completed. */ device = malloc(sizeof (pendingdev_t)); if (device == NULL) { perror("malloc"); exit(EXIT_FAILURE); } (void) strlcpy(device->pd_physpath, physpath, sizeof (device->pd_physpath)); list_insert_tail(&g_device_list, device); - zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)", leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ } else /* labeled */ { boolean_t found = B_FALSE; /* * match up with request above to label the disk */ for (device = list_head(&g_device_list); device != NULL; device = list_next(&g_device_list, device)) { if (strcmp(physpath, device->pd_physpath) == 0) { list_remove(&g_device_list, device); free(device); found = B_TRUE; break; } zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", physpath, device->pd_physpath); } if (!found) { /* unexpected partition slice encountered */ - zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", - fullpath); + zed_log_msg(LOG_WARNING, "labeled disk %s was " + "unexpected here", fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; } zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", physpath, (u_longlong_t)guid); - (void) snprintf(devpath, sizeof (devpath), "%s%s", - DEV_BYID_PATH, new_devid); + /* + * Paths that begin with '/dev/disk/by-id/' will change and so + * they must be updated before calling zpool_vdev_attach(). + */ + if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) { + (void) snprintf(pathbuf, sizeof (pathbuf), "%s%s", + DEV_BYID_PATH, new_devid); + zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' " + "replaced by '%s'", path, pathbuf); + path = pathbuf; + } } + libzfs_free_str_array(lines, lines_cnt); + /* * Construct the root vdev to pass to zpool_vdev_attach(). While adding * the entire vdev structure is harmless, we construct a reduced set of * path/physpath/wholedisk to keep it simple. */ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); return; } if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); nvlist_free(nvroot); return; } if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || (physpath != NULL && nvlist_add_string(newvd, ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || (enc_sysfs_path != NULL && nvlist_add_string(newvd, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&newvd, 1) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); nvlist_free(newvd); nvlist_free(nvroot); return; } nvlist_free(newvd); /* * Wait for udev to verify the links exist, then auto-replace * the leaf disk at same physical location. */ - if (zpool_label_disk_wait(path, 3000) != 0) { - zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " - "disk %s is missing", path); + if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling " + "replacement disk, the expected disk partition link '%s' " + "is missing after waiting %u ms", + zpool_get_name(zhp), path, DISK_LABEL_WAIT); nvlist_free(nvroot); return; } /* * Prefer sequential resilvering when supported (mirrors and dRAID), * otherwise fallback to a traditional healing resilver. */ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); if (ret != 0) { ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); } - zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : libzfs_error_description(g_zfshdl)); nvlist_free(nvroot); } /* * Utility functions to find a vdev matching given criteria. */ typedef struct dev_data { const char *dd_compare; const char *dd_prop; zfs_process_func_t dd_func; boolean_t dd_found; boolean_t dd_islabeled; uint64_t dd_pool_guid; uint64_t dd_vdev_guid; uint64_t dd_new_vdev_guid; const char *dd_new_devid; uint64_t dd_num_spares; } dev_data_t; static void zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) { dev_data_t *dp = data; const char *path = NULL; uint_t c, children; nvlist_t **child; uint64_t guid = 0; uint64_t isspare = 0; /* * First iterate over any children. */ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } /* * Iterate over any spares and cache devices */ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } /* once a vdev was matched and processed there is nothing left to do */ if (dp->dd_found && dp->dd_num_spares == 0) return; (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &guid); /* * Match by GUID if available otherwise fallback to devid or physical */ if (dp->dd_vdev_guid != 0) { if (guid != dp->dd_vdev_guid) return; zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); dp->dd_found = B_TRUE; } else if (dp->dd_compare != NULL) { /* * NOTE: On Linux there is an event for partition, so unlike * illumos, substring matching is not required to accommodate * the partition suffix. An exact match will be present in * the dp->dd_compare value. * If the attached disk already contains a vdev GUID, it means * the disk is not clean. In such a scenario, the physical path * would be a match that makes the disk faulted when trying to * online it. So, we would only want to proceed if either GUID * matches with the last attached disk or the disk is in clean * state. */ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || strcmp(dp->dd_compare, path) != 0) { return; } if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) { zed_log_msg(LOG_INFO, " %s: no match (GUID:%llu" " != vdev GUID:%llu)", __func__, dp->dd_new_vdev_guid, guid); return; } zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", dp->dd_prop, path); dp->dd_found = B_TRUE; - /* pass the new devid for use by replacing code */ + /* pass the new devid for use by auto-replacing code */ if (dp->dd_new_devid != NULL) { (void) nvlist_add_string(nvl, "new_devid", dp->dd_new_devid); } } if (dp->dd_found == B_TRUE && nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare) dp->dd_num_spares++; (dp->dd_func)(zhp, nvl, dp->dd_islabeled); } static void zfs_enable_ds(void *arg) { unavailpool_t *pool = (unavailpool_t *)arg; (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); zpool_close(pool->uap_zhp); free(pool); } static int zfs_iter_pool(zpool_handle_t *zhp, void *data) { nvlist_t *config, *nvl; dev_data_t *dp = data; uint64_t pool_guid; unavailpool_t *pool; zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop); /* * For each vdev in this pool, look for a match to apply dd_func */ if ((config = zpool_get_config(zhp, NULL)) != NULL) { if (dp->dd_pool_guid == 0 || (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { (void) nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl); zfs_iter_vdev(zhp, nvl, data); } } else { zed_log_msg(LOG_INFO, "%s: no config\n", __func__); } /* * if this pool was originally unavailable, * then enable its datasets asynchronously */ if (g_enumeration_done) { for (pool = list_head(&g_pool_list); pool != NULL; pool = list_next(&g_pool_list, pool)) { if (strcmp(zpool_get_name(zhp), zpool_get_name(pool->uap_zhp))) continue; if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { list_remove(&g_pool_list, pool); (void) tpool_dispatch(g_tpool, zfs_enable_ds, pool); break; } } } zpool_close(zhp); /* cease iteration after a match */ return (dp->dd_found && dp->dd_num_spares == 0); } /* * Given a physical device location, iterate over all * (pool, vdev) pairs which correspond to that location. */ static boolean_t devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, boolean_t is_slice, uint64_t new_vdev_guid) { dev_data_t data = { 0 }; data.dd_compare = physical; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; /* used by auto replace code */ data.dd_new_vdev_guid = new_vdev_guid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device identifier, find any vdevs with a matching by-vdev * path. Normally we shouldn't need this as the comparison would be * made earlier in the devphys_iter(). For example, if we were replacing * /dev/disk/by-vdev/L28, normally devphys_iter() would match the * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28" * of the new disk config. However, we've seen cases where * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's * an example of a real 2-disk mirror pool where one disk was force * faulted: * * com.delphix:vdev_zap_top: 129 * children[0]: * type: 'disk' * id: 0 * guid: 14309659774640089719 * path: '/dev/disk/by-vdev/L28' * whole_disk: 0 * DTL: 654 * create_txg: 4 * com.delphix:vdev_zap_leaf: 1161 * faulted: 1 * aux_state: 'external' * children[1]: * type: 'disk' * id: 1 * guid: 16002508084177980912 * path: '/dev/disk/by-vdev/L29' * devid: 'dm-uuid-mpath-35000c500a61d68a3' * phys_path: 'L29' * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' * whole_disk: 0 * DTL: 1028 * create_txg: 4 * com.delphix:vdev_zap_leaf: 131 * * So in the case above, the only thing we could compare is the path. * * We can do this because we assume by-vdev paths are authoritative as physical * paths. We could not assume this for normal paths like /dev/sda since the * physical location /dev/sda points to could change over time. */ static boolean_t by_vdev_path_iter(const char *by_vdev_path, const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_compare = by_vdev_path; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_PATH; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; if (strncmp(by_vdev_path, DEV_BYVDEV_PATH, strlen(DEV_BYVDEV_PATH)) != 0) { /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */ return (B_FALSE); } (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device identifier, find any vdevs with a matching devid. * On Linux we can match devid directly which is always a whole disk. */ static boolean_t devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_compare = devid; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_DEVID; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device guid, find any vdevs with a matching guid. */ static boolean_t guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_func = func; data.dd_found = B_FALSE; data.dd_pool_guid = pool_guid; data.dd_vdev_guid = vdev_guid; data.dd_islabeled = is_slice; data.dd_new_devid = devid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Handle a EC_DEV_ADD.ESC_DISK event. * * illumos * Expects: DEV_PHYS_PATH string in schema * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID * * path: '/dev/dsk/c0t1d0s0' (persistent) * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' * * linux * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID * * path: '/dev/sdc1' (not persistent) * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' */ static int zfs_deliver_add(nvlist_t *nvl) { const char *devpath = NULL, *devid = NULL; uint64_t pool_guid = 0, vdev_guid = 0; boolean_t is_slice; /* * Expecting a devid string and an optional physical location and guid */ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) { zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__); return (-1); } (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", devid, devpath ? devpath : "NULL", is_slice); /* * Iterate over all vdevs looking for a match in the following order: * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since * by-vdev paths represent physical paths). */ if (devid_iter(devid, zfs_process_add, is_slice)) return (0); if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add, is_slice, vdev_guid)) return (0); if (vdev_guid != 0) (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, is_slice); if (devpath != NULL) { /* Can we match a /dev/disk/by-vdev/ path? */ char by_vdev_path[MAXPATHLEN]; snprintf(by_vdev_path, sizeof (by_vdev_path), "/dev/disk/by-vdev/%s", devpath); if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add, is_slice)) return (0); } return (0); } /* * Called when we receive a VDEV_CHECK event, which indicates a device could not * be opened during initial pool open, but the autoreplace property was set on * the pool. In this case, we treat it as if it were an add event. */ static int zfs_deliver_check(nvlist_t *nvl) { dev_data_t data = { 0 }; if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &data.dd_pool_guid) != 0 || nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &data.dd_vdev_guid) != 0 || data.dd_vdev_guid == 0) return (0); zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", data.dd_pool_guid, data.dd_vdev_guid); data.dd_func = zfs_process_add; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (0); } /* * Given a path to a vdev, lookup the vdev's physical size from its * config nvlist. * * Returns the vdev's physical size in bytes on success, 0 on error. */ static uint64_t vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) { nvlist_t *nvl = NULL; boolean_t avail_spare, l2cache, log; vdev_stat_t *vs = NULL; uint_t c; nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); if (!nvl) return (0); verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (!vs) { zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, vdev_path); return (0); } return (vs->vs_pspace); } /* * Given a path to a vdev, lookup if the vdev is a "whole disk" in the * config nvlist. "whole disk" means that ZFS was passed a whole disk * at pool creation time, which it partitioned up and has full control over. * Thus a partition with wholedisk=1 set tells us that zfs created the * partition at creation time. A partition without whole disk set would have * been created by externally (like with fdisk) and passed to ZFS. * * Returns the whole disk value (either 0 or 1). */ static uint64_t vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) { nvlist_t *nvl = NULL; boolean_t avail_spare, l2cache, log; uint64_t wholedisk = 0; nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); if (!nvl) return (0); (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); return (wholedisk); } /* * If the device size grew more than 1% then return true. */ #define DEVICE_GREW(oldsize, newsize) \ ((newsize > oldsize) && \ ((newsize / (newsize - oldsize)) <= 100)) static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { boolean_t avail_spare, l2cache; nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; const char *tmp_devname; char devname[MAXPATHLEN] = ""; uint64_t guid; if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { sprintf(devname, "%llu", (u_longlong_t)guid); } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, &tmp_devname) == 0) { strlcpy(devname, tmp_devname, MAXPATHLEN); zfs_append_partition(devname, MAXPATHLEN); } else { zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); } zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, &avail_spare, &l2cache, NULL)) != NULL) { const char *path; char fullpath[MAXPATHLEN]; uint64_t wholedisk = 0; error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); if (error) { zpool_close(zhp); return (0); } (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (wholedisk) { char *tmp; path = strrchr(path, '/'); if (path != NULL) { tmp = zfs_strip_partition(path + 1); if (tmp == NULL) { zpool_close(zhp); return (0); } } else { zpool_close(zhp); return (0); } (void) strlcpy(fullpath, tmp, sizeof (fullpath)); free(tmp); /* * We need to reopen the pool associated with this * device so that the kernel can update the size of * the expanded device. When expanding there is no * need to restart the scrub from the beginning. */ boolean_t scrub_restart = B_FALSE; (void) zpool_reopen_one(zhp, &scrub_restart); } else { (void) strlcpy(fullpath, path, sizeof (fullpath)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { /* * If this disk size has not changed, then * there's no need to do an autoexpand. To * check we look at the disk's size in its * config, and compare it to the disk size * that udev is reporting. */ uint64_t udev_size = 0, conf_size = 0, wholedisk = 0, udev_parent_size = 0; /* * Get the size of our disk that udev is * reporting. */ if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, &udev_size) != 0) { udev_size = 0; } /* * Get the size of our disk's parent device * from udev (where sda1's parent is sda). */ if (nvlist_lookup_uint64(udev_nvl, DEV_PARENT_SIZE, &udev_parent_size) != 0) { udev_parent_size = 0; } conf_size = vdev_size_from_config(zhp, fullpath); wholedisk = vdev_whole_disk_from_config(zhp, fullpath); /* * Only attempt an autoexpand if the vdev size * changed. There are two different cases * to consider. * * 1. wholedisk=1 * If you do a 'zpool create' on a whole disk * (like /dev/sda), then zfs will create * partitions on the disk (like /dev/sda1). In * that case, wholedisk=1 will be set in the * partition's nvlist config. So zed will need * to see if your parent device (/dev/sda) * expanded in size, and if so, then attempt * the autoexpand. * * 2. wholedisk=0 * If you do a 'zpool create' on an existing * partition, or a device that doesn't allow * partitions, then wholedisk=0, and you will * simply need to check if the device itself * expanded in size. */ if (DEVICE_GREW(conf_size, udev_size) || (wholedisk && DEVICE_GREW(conf_size, udev_parent_size))) { error = zpool_vdev_online(zhp, fullpath, 0, &newstate); zed_log_msg(LOG_INFO, "%s: autoexpanding '%s' from %llu" " to %llu bytes in pool '%s': %d", __func__, fullpath, conf_size, MAX(udev_size, udev_parent_size), zpool_get_name(zhp), error); } } } zpool_close(zhp); return (1); } zpool_close(zhp); return (0); } /* * This function handles the ESC_DEV_DLE device change event. Use the * provided vdev guid when looking up a disk or partition, when the guid * is not present assume the entire disk is owned by ZFS and append the * expected -part1 partition information then lookup by physical path. */ static int zfs_deliver_dle(nvlist_t *nvl) { const char *devname; char name[MAXPATHLEN]; uint64_t guid; if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { sprintf(name, "%llu", (u_longlong_t)guid); } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { strlcpy(name, devname, MAXPATHLEN); zfs_append_partition(name, MAXPATHLEN); } else { sprintf(name, "unknown"); zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); } return (0); } /* * syseventd daemon module event handler * * Handles syseventd daemon zfs device related events: * * EC_DEV_ADD.ESC_DISK * EC_DEV_STATUS.ESC_DEV_DLE * EC_ZFS.ESC_ZFS_VDEV_CHECK * * Note: assumes only one thread active at a time (not thread safe) */ static int zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) { int ret; boolean_t is_check = B_FALSE, is_dle = B_FALSE; if (strcmp(class, EC_DEV_ADD) == 0) { /* * We're mainly interested in disk additions, but we also listen * for new loop devices, to allow for simplified testing. */ if (strcmp(subclass, ESC_DISK) != 0 && strcmp(subclass, ESC_LOFI) != 0) return (0); is_check = B_FALSE; } else if (strcmp(class, EC_ZFS) == 0 && strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { /* * This event signifies that a device failed to open * during pool load, but the 'autoreplace' property was * set, so we should pretend it's just been added. */ is_check = B_TRUE; } else if (strcmp(class, EC_DEV_STATUS) == 0 && strcmp(subclass, ESC_DEV_DLE) == 0) { is_dle = B_TRUE; } else { return (0); } if (is_dle) ret = zfs_deliver_dle(nvl); else if (is_check) ret = zfs_deliver_check(nvl); else ret = zfs_deliver_add(nvl); return (ret); } static void * zfs_enum_pools(void *arg) { (void) arg; (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); /* * Linux - instead of using a thread pool, each list entry * will spawn a thread when an unavailable pool transitions * to available. zfs_slm_fini will wait for these threads. */ g_enumeration_done = B_TRUE; return (NULL); } /* * called from zed daemon at startup * * sent messages from zevents or udev monitor * * For now, each agent has its own libzfs instance */ int zfs_slm_init(void) { if ((g_zfshdl = libzfs_init()) == NULL) return (-1); /* * collect a list of unavailable pools (asynchronously, * since this can take a while) */ list_create(&g_pool_list, sizeof (struct unavailpool), offsetof(struct unavailpool, uap_node)); if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { list_destroy(&g_pool_list); libzfs_fini(g_zfshdl); return (-1); } pthread_setname_np(g_zfs_tid, "enum-pools"); list_create(&g_device_list, sizeof (struct pendingdev), offsetof(struct pendingdev, pd_node)); return (0); } void zfs_slm_fini(void) { unavailpool_t *pool; pendingdev_t *device; /* wait for zfs_enum_pools thread to complete */ (void) pthread_join(g_zfs_tid, NULL); /* destroy the thread pool */ if (g_tpool != NULL) { tpool_wait(g_tpool); tpool_destroy(g_tpool); } while ((pool = list_remove_head(&g_pool_list)) != NULL) { zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); while ((device = list_remove_head(&g_device_list)) != NULL) free(device); list_destroy(&g_device_list); libzfs_fini(g_zfshdl); } void zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) { zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); (void) zfs_slm_deliver_event(class, subclass, nvl); } diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 index fec73a269a78..6d60e643593b 100644 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 @@ -1,14 +1,23 @@ # Features which are supported by GRUB2 +allocation_classes async_destroy +block_cloning bookmarks +device_rebuild embedded_data empty_bpobj enabled_txg extensible_dataset filesystem_limits hole_birth large_blocks livelist +log_spacemap lz4_compress +project_quota +resilver_defer spacemap_histogram +spacemap_v2 +userobj_accounting +zilsaxattr zpool_checkpoint diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 index 162ff32a7803..125c578344f9 100644 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfsonosx-1.8.1 @@ -1,21 +1,20 @@ # Features supported by OpenZFSonOSX 1.8.1 async_destroy bookmarks device_removal edonr embedded_data empty_bpobj enabled_txg -encryption extensible_dataset filesystem_limits hole_birth large_blocks lz4_compress multi_vdev_crash_dump obsolete_counts sha512 skein spacemap_histogram spacemap_v2 zpool_checkpoint diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 7c6549b0ae54..506b529dce48 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -1,726 +1,707 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2016 Igor Kozhukhov . */ #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" /* * Private interface for iterating over pools specified on the command line. * Most consumers will call for_each_pool, but in order to support iostat, we * allow fined grained control through the zpool_list_t interface. */ typedef struct zpool_node { zpool_handle_t *zn_handle; uu_avl_node_t zn_avlnode; int zn_mark; } zpool_node_t; struct zpool_list { boolean_t zl_findall; boolean_t zl_literal; uu_avl_t *zl_avl; uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; zfs_type_t zl_type; }; static int zpool_compare(const void *larg, const void *rarg, void *unused) { (void) unused; zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; const char *lname = zpool_get_name(l); const char *rname = zpool_get_name(r); return (strcmp(lname, rname)); } /* * Callback function for pool_list_get(). Adds the given pool to the AVL tree * of known pools. */ static int add_pool(zpool_handle_t *zhp, void *data) { zpool_list_t *zlp = data; zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); uu_avl_index_t idx; node->zn_handle = zhp; uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { if (zlp->zl_proplist && zpool_expand_proplist(zhp, zlp->zl_proplist, zlp->zl_type, zlp->zl_literal) != 0) { zpool_close(zhp); free(node); return (-1); } uu_avl_insert(zlp->zl_avl, node, idx); } else { zpool_close(zhp); free(node); return (-1); } return (0); } /* * Create a list of pools based on the given arguments. If we're given no * arguments, then iterate over all pools in the system and add them to the AVL * tree. Otherwise, add only those pool explicitly specified on the command * line. */ zpool_list_t * pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, boolean_t literal, int *err) { zpool_list_t *zlp; zlp = safe_malloc(sizeof (zpool_list_t)); zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); if (zlp->zl_pool == NULL) zpool_no_memory(); if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, UU_DEFAULT)) == NULL) zpool_no_memory(); zlp->zl_proplist = proplist; zlp->zl_type = type; zlp->zl_literal = literal; if (argc == 0) { (void) zpool_iter(g_zfs, add_pool, zlp); zlp->zl_findall = B_TRUE; } else { int i; for (i = 0; i < argc; i++) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != NULL) { if (add_pool(zhp, zlp) != 0) *err = B_TRUE; } else { *err = B_TRUE; } } } return (zlp); } /* * Search for any new pools, adding them to the list. We only add pools when no * options were given on the command line. Otherwise, we keep the list fixed as * those that were explicitly specified. */ void pool_list_update(zpool_list_t *zlp) { if (zlp->zl_findall) (void) zpool_iter(g_zfs, add_pool, zlp); } /* * Iterate over all pools in the list, executing the callback for each */ int pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, void *data) { zpool_node_t *node, *next_node; int ret = 0; for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { next_node = uu_avl_next(zlp->zl_avl, node); if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || unavail) ret |= func(node->zn_handle, data); } return (ret); } /* * Remove the given pool from the list. When running iostat, we want to remove * those pools that no longer exist. */ void pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) { zpool_node_t search, *node; search.zn_handle = zhp; if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { uu_avl_remove(zlp->zl_avl, node); zpool_close(node->zn_handle); free(node); } } /* * Free all the handles associated with this list. */ void pool_list_free(zpool_list_t *zlp) { uu_avl_walk_t *walk; zpool_node_t *node; if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory")); exit(1); } while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(zlp->zl_avl, node); zpool_close(node->zn_handle); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(zlp->zl_avl); uu_avl_pool_destroy(zlp->zl_pool); free(zlp); } /* * Returns the number of elements in the pool list. */ int pool_list_count(zpool_list_t *zlp) { return (uu_avl_numnodes(zlp->zl_avl)); } /* * High level function which iterates over all pools given on the command line, * using the pool_list_* interfaces. */ int for_each_pool(int argc, char **argv, boolean_t unavail, zprop_list_t **proplist, zfs_type_t type, boolean_t literal, zpool_iter_f func, void *data) { zpool_list_t *list; int ret = 0; if ((list = pool_list_get(argc, argv, proplist, type, literal, &ret)) == NULL) return (1); if (pool_list_iter(list, unavail, func, data) != 0) ret = 1; pool_list_free(list); return (ret); } /* * This is the equivalent of for_each_pool() for vdevs. It iterates thorough * all vdevs in the pool, ignoring root vdevs and holes, calling func() on * each one. * * @zhp: Zpool handle * @func: Function to call on each vdev * @data: Custom data to pass to the function */ int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) { nvlist_t *config, *nvroot = NULL; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); } return (for_each_vdev_cb((void *) zhp, nvroot, func, data)); } /* * Process the vcdl->vdev_cmd_data[] array to figure out all the unique column * names and their widths. When this function is done, vcdl->uniq_cols, * vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be filled in. */ static void process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl) { char **uniq_cols = NULL, **tmp = NULL; int *uniq_cols_width; vdev_cmd_data_t *data; int cnt = 0; int k; /* For each vdev */ for (int i = 0; i < vcdl->count; i++) { data = &vcdl->data[i]; /* For each column the vdev reported */ for (int j = 0; j < data->cols_cnt; j++) { /* Is this column in our list of unique column names? */ for (k = 0; k < cnt; k++) { if (strcmp(data->cols[j], uniq_cols[k]) == 0) break; /* yes it is */ } if (k == cnt) { /* No entry for column, add to list */ tmp = realloc(uniq_cols, sizeof (*uniq_cols) * (cnt + 1)); if (tmp == NULL) break; /* Nothing we can do... */ uniq_cols = tmp; uniq_cols[cnt] = data->cols[j]; cnt++; } } } /* * We now have a list of all the unique column names. Figure out the * max width of each column by looking at the column name and all its * values. */ uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt); for (int i = 0; i < cnt; i++) { /* Start off with the column title's width */ uniq_cols_width[i] = strlen(uniq_cols[i]); /* For each vdev */ for (int j = 0; j < vcdl->count; j++) { /* For each of the vdev's values in a column */ data = &vcdl->data[j]; for (k = 0; k < data->cols_cnt; k++) { /* Does this vdev have a value for this col? */ if (strcmp(data->cols[k], uniq_cols[i]) == 0) { /* Is the value width larger? */ uniq_cols_width[i] = MAX(uniq_cols_width[i], strlen(data->lines[k])); } } } } vcdl->uniq_cols = uniq_cols; vcdl->uniq_cols_cnt = cnt; vcdl->uniq_cols_width = uniq_cols_width; } /* * Process a line of command output * * When running 'zpool iostat|status -c' the lines of output can either be * in the form of: * * column_name=value * * Or just: * * value * * Process the column_name (if any) and value. * * Returns 0 if line was processed, and there are more lines can still be * processed. * * Returns 1 if this was the last line to process, or error. */ static int vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) { char *col = NULL; char *val = line; char *equals; char **tmp; if (line == NULL) return (1); equals = strchr(line, '='); if (equals != NULL) { /* * We have a 'column=value' type line. Split it into the * column and value strings by turning the '=' into a '\0'. */ *equals = '\0'; col = line; val = equals + 1; } else { val = line; } /* Do we already have a column by this name? If so, skip it. */ if (col != NULL) { for (int i = 0; i < data->cols_cnt; i++) { if (strcmp(col, data->cols[i]) == 0) return (0); /* Duplicate, skip */ } } if (val != NULL) { tmp = realloc(data->lines, (data->lines_cnt + 1) * sizeof (*data->lines)); if (tmp == NULL) return (1); data->lines = tmp; data->lines[data->lines_cnt] = strdup(val); data->lines_cnt++; } if (col != NULL) { tmp = realloc(data->cols, (data->cols_cnt + 1) * sizeof (*data->cols)); if (tmp == NULL) return (1); data->cols = tmp; data->cols[data->cols_cnt] = strdup(col); data->cols_cnt++; } if (val != NULL && col == NULL) return (1); return (0); } /* * Run the cmd and store results in *data. */ static void vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) { int rc; char *argv[2] = {cmd}; - char *env[5] = {(char *)"PATH=/bin:/sbin:/usr/bin:/usr/sbin"}; + char **env; char **lines = NULL; int lines_cnt = 0; int i; - /* Setup our custom environment variables */ - rc = asprintf(&env[1], "VDEV_PATH=%s", - data->path ? data->path : ""); - if (rc == -1) { - env[1] = NULL; + env = zpool_vdev_script_alloc_env(data->pool, data->path, data->upath, + data->vdev_enc_sysfs_path, NULL, NULL); + if (env == NULL) goto out; - } - - rc = asprintf(&env[2], "VDEV_UPATH=%s", - data->upath ? data->upath : ""); - if (rc == -1) { - env[2] = NULL; - goto out; - } - - rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", - data->vdev_enc_sysfs_path ? - data->vdev_enc_sysfs_path : ""); - if (rc == -1) { - env[3] = NULL; - goto out; - } /* Run the command */ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, &lines_cnt); + + zpool_vdev_script_free_env(env); + if (rc != 0) goto out; /* Process the output we got */ for (i = 0; i < lines_cnt; i++) if (vdev_process_cmd_output(data, lines[i]) != 0) break; out: if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); - - /* Start with i = 1 since env[0] was statically allocated */ - for (i = 1; i < ARRAY_SIZE(env); i++) - free(env[i]); } /* * Generate the search path for zpool iostat/status -c scripts. * The string returned must be freed. */ char * zpool_get_cmd_search_path(void) { const char *env; char *sp = NULL; env = getenv("ZPOOL_SCRIPTS_PATH"); if (env != NULL) return (strdup(env)); env = getenv("HOME"); if (env != NULL) { if (asprintf(&sp, "%s/.zpool.d:%s", env, ZPOOL_SCRIPTS_DIR) != -1) { return (sp); } } if (asprintf(&sp, "%s", ZPOOL_SCRIPTS_DIR) != -1) return (sp); return (NULL); } /* Thread function run for each vdev */ static void vdev_run_cmd_thread(void *cb_cmd_data) { vdev_cmd_data_t *data = cb_cmd_data; char *cmd = NULL, *cmddup, *cmdrest; cmddup = strdup(data->cmd); if (cmddup == NULL) return; cmdrest = cmddup; while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) { char *dir = NULL, *sp, *sprest; char fullpath[MAXPATHLEN]; if (strchr(cmd, '/') != NULL) continue; sp = zpool_get_cmd_search_path(); if (sp == NULL) continue; sprest = sp; while ((dir = strtok_r(sprest, ":", &sprest))) { if (snprintf(fullpath, sizeof (fullpath), "%s/%s", dir, cmd) == -1) continue; if (access(fullpath, X_OK) == 0) { vdev_run_cmd(data, fullpath); break; } } free(sp); } free(cmddup); } /* For each vdev in the pool run a command */ static int for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) { vdev_cmd_data_list_t *vcdl = cb_vcdl; vdev_cmd_data_t *data; const char *path = NULL; char *vname = NULL; const char *vdev_enc_sysfs_path = NULL; int i, match = 0; zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vdev_enc_sysfs_path); /* Spares show more than once if they're in use, so skip if exists */ for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) == 0) && (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) { /* vdev already exists, skip it */ return (0); } } /* Check for selected vdevs here, if any */ for (i = 0; i < vcdl->vdev_names_count; i++) { vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); if (strcmp(vcdl->vdev_names[i], vname) == 0) { free(vname); match = 1; break; /* match */ } free(vname); } /* If we selected vdevs, and this isn't one of them, then bail out */ if (!match && vcdl->vdev_names_count) return (0); /* * Resize our array and add in the new element. */ if (!(vcdl->data = realloc(vcdl->data, sizeof (*vcdl->data) * (vcdl->count + 1)))) return (ENOMEM); /* couldn't realloc */ data = &vcdl->data[vcdl->count]; data->pool = strdup(zpool_get_name(zhp)); data->path = strdup(path); data->upath = zfs_get_underlying_path(path); data->cmd = vcdl->cmd; data->lines = data->cols = NULL; data->lines_cnt = data->cols_cnt = 0; if (vdev_enc_sysfs_path) data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path); else data->vdev_enc_sysfs_path = NULL; vcdl->count++; return (0); } /* Get the names and count of the vdevs */ static int all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl) { return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl)); } /* * Now that vcdl is populated with our complete list of vdevs, spawn * off the commands. */ static void all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl) { tpool_t *t; t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); if (t == NULL) return; /* Spawn off the command for each vdev */ for (int i = 0; i < vcdl->count; i++) { (void) tpool_dispatch(t, vdev_run_cmd_thread, (void *) &vcdl->data[i]); } /* Wait for threads to finish */ tpool_wait(t); tpool_destroy(t); } /* * Run command 'cmd' on all vdevs in all pools in argv. Saves the first line of * output from the command in vcdk->data[].line for all vdevs. If you want * to run the command on only certain vdevs, fill in g_zfs, vdev_names, * vdev_names_count, and cb_name_flags. Otherwise leave them as zero. * * Returns a vdev_cmd_data_list_t that must be freed with * free_vdev_cmd_data_list(); */ vdev_cmd_data_list_t * all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, int cb_name_flags) { vdev_cmd_data_list_t *vcdl; vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t)); vcdl->cmd = cmd; vcdl->vdev_names = vdev_names; vcdl->vdev_names_count = vdev_names_count; vcdl->cb_name_flags = cb_name_flags; vcdl->g_zfs = g_zfs; /* Gather our list of all vdevs in all pools */ for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, all_pools_for_each_vdev_gather_cb, vcdl); /* Run command on all vdevs in all pools */ all_pools_for_each_vdev_run_vcdl(vcdl); /* * vcdl->data[] now contains all the column names and values for each * vdev. We need to process that into a master list of unique column * names, and figure out the width of each column. */ process_unique_cmd_columns(vcdl); return (vcdl); } /* * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run() */ void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl) { free(vcdl->uniq_cols); free(vcdl->uniq_cols_width); for (int i = 0; i < vcdl->count; i++) { free(vcdl->data[i].path); free(vcdl->data[i].pool); free(vcdl->data[i].upath); for (int j = 0; j < vcdl->data[i].lines_cnt; j++) free(vcdl->data[i].lines[j]); free(vcdl->data[i].lines); for (int j = 0; j < vcdl->data[i].cols_cnt; j++) free(vcdl->data[i].cols[j]); free(vcdl->data[i].cols); free(vcdl->data[i].vdev_enc_sysfs_path); } free(vcdl->data); free(vcdl); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h index b35dea0cd449..db8e631dc6be 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -1,141 +1,145 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef ZPOOL_UTIL_H #define ZPOOL_UTIL_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* Path to scripts you can run with "zpool status/iostat -c" */ #define ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d" /* * Basic utility functions */ void *safe_malloc(size_t); void *safe_realloc(void *, size_t); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); uint64_t array64_max(uint64_t array[], unsigned int len); int highbit64(uint64_t i); int lowbit64(uint64_t i); /* * Misc utility functions */ char *zpool_get_cmd_search_path(void); /* * Virtual device functions */ nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv); nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv); /* * Pool list functions */ int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, zfs_type_t, boolean_t, zpool_iter_f, void *); /* Vdev list functions */ int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); typedef struct zpool_list zpool_list_t; zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t, boolean_t, int *); void pool_list_update(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); int pool_list_count(zpool_list_t *); void pool_list_remove(zpool_list_t *, zpool_handle_t *); extern libzfs_handle_t *g_zfs; typedef struct vdev_cmd_data { char **lines; /* Array of lines of output, minus the column name */ int lines_cnt; /* Number of lines in the array */ char **cols; /* Array of column names */ int cols_cnt; /* Number of column names */ char *path; /* vdev path */ char *upath; /* vdev underlying path */ char *pool; /* Pool name */ char *cmd; /* backpointer to cmd */ char *vdev_enc_sysfs_path; /* enclosure sysfs path (if any) */ } vdev_cmd_data_t; typedef struct vdev_cmd_data_list { char *cmd; /* Command to run */ unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ /* fields used to select only certain vdevs, if requested */ libzfs_handle_t *g_zfs; char **vdev_names; int vdev_names_count; int cb_name_flags; vdev_cmd_data_t *data; /* Array of vdevs */ /* List of unique column names and widths */ char **uniq_cols; int uniq_cols_cnt; int *uniq_cols_width; } vdev_cmd_data_list_t; vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, int cb_name_flags); void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); +void free_vdev_cmd_data(vdev_cmd_data_t *data); + +int vdev_run_cmd_simple(char *path, char *cmd); + int check_device(const char *path, boolean_t force, boolean_t isspare, boolean_t iswholedisk); boolean_t check_sector_size_database(char *path, int *sector_size); void vdev_error(const char *fmt, ...) __attribute__((format(printf, 1, 2))); int check_file(const char *file, boolean_t force, boolean_t isspare); void after_zpool_upgrade(zpool_handle_t *zhp); int check_file_generic(const char *file, boolean_t force, boolean_t isspare); #ifdef __cplusplus } #endif #endif /* ZPOOL_UTIL_H */ diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 99a521aa2a28..3d0fc089c32f 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -1,1882 +1,1911 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright (c) 2016, 2017 Intel Corporation. * Copyright 2016 Igor Kozhukhov . */ /* * Functions to convert between a list of vdevs and an nvlist representing the * configuration. Each entry in the list can be one of: * * Device vdevs * disk=(path=..., devid=...) * file=(path=...) * * Group vdevs * raidz[1|2]=(...) * mirror=(...) * * Hot spares * * While the underlying implementation supports it, group vdevs cannot contain * other group vdevs. All userland verification of devices is contained within * this file. If successful, the nvlist returned can be passed directly to the * kernel; we've done as much verification as possible in userland. * * Hot spares are a special case, and passed down as an array of disk vdevs, at * the same level as the root of the vdev tree. * * The only function exported by this file is 'make_root_vdev'. The * function performs several passes: * * 1. Construct the vdev specification. Performs syntax validation and * makes sure each device is valid. * 2. Check for devices in use. Using libblkid to make sure that no * devices are also in use. Some can be overridden using the 'force' * flag, others cannot. * 3. Check for replication errors if the 'force' flag is not specified. * validates that the replication level is consistent across the * entire pool. * 4. Call libzfs to label any whole disks with an EFI label. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include #include /* * For any given vdev specification, we can have multiple errors. The * vdev_error() function keeps track of whether we have seen an error yet, and * prints out a header if its the first error we've seen. */ boolean_t error_seen; boolean_t is_force; void vdev_error(const char *fmt, ...) { va_list ap; if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); if (!is_force) (void) fprintf(stderr, gettext("use '-f' to override " "the following errors:\n")); else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); error_seen = B_TRUE; } va_start(ap, fmt); (void) vfprintf(stderr, fmt, ap); va_end(ap); } /* * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap. */ int check_file_generic(const char *file, boolean_t force, boolean_t isspare) { char *name; int fd; int ret = 0; pool_state_t state; boolean_t inuse; if ((fd = open(file, O_RDONLY)) < 0) return (0); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { const char *desc; switch (state) { case POOL_STATE_ACTIVE: desc = gettext("active"); break; case POOL_STATE_EXPORTED: desc = gettext("exported"); break; case POOL_STATE_POTENTIALLY_ACTIVE: desc = gettext("potentially active"); break; default: desc = gettext("unknown"); break; } /* * Allow hot spares to be shared between pools. */ if (state == POOL_STATE_SPARE && isspare) { free(name); (void) close(fd); return (0); } if (state == POOL_STATE_ACTIVE || state == POOL_STATE_SPARE || !force) { switch (state) { case POOL_STATE_SPARE: vdev_error(gettext("%s is reserved as a hot " "spare for pool %s\n"), file, name); break; default: vdev_error(gettext("%s is part of %s pool " "'%s'\n"), file, desc, name); break; } ret = -1; } free(name); } (void) close(fd); return (ret); } /* * This may be a shorthand device path or it could be total gibberish. * Check to see if it is a known device available in zfs_vdev_paths. * As part of this check, see if we've been given an entire disk * (minus the slice number). */ static int is_shorthand_path(const char *arg, char *path, size_t path_size, struct stat64 *statbuf, boolean_t *wholedisk) { int error; error = zfs_resolve_shortname(arg, path, path_size); if (error == 0) { *wholedisk = zfs_dev_is_whole_disk(path); if (*wholedisk || (stat64(path, statbuf) == 0)) return (0); } strlcpy(path, arg, path_size); memset(statbuf, 0, sizeof (*statbuf)); *wholedisk = B_FALSE; return (error); } /* * Determine if the given path is a hot spare within the given configuration. * If no configuration is given we rely solely on the label. */ static boolean_t is_spare(nvlist_t *config, const char *path) { int fd; pool_state_t state; char *name = NULL; nvlist_t *label; uint64_t guid, spareguid; nvlist_t *nvroot; nvlist_t **spares; uint_t i, nspares; boolean_t inuse; if (zpool_is_draid_spare(path)) return (B_TRUE); if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || !inuse || state != POOL_STATE_SPARE || zpool_read_label(fd, &label, NULL) != 0) { free(name); (void) close(fd); return (B_FALSE); } free(name); (void) close(fd); if (config == NULL) { nvlist_free(label); return (B_TRUE); } verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { verify(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &spareguid) == 0); if (spareguid == guid) return (B_TRUE); } } return (B_FALSE); } /* * Create a leaf vdev. Determine if this is a file or a device. If it's a * device, fill in the device id to make a complete nvlist. Valid forms for a * leaf vdev are: * * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx * draid* Virtual dRAID spare */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; nvlist_t *vdev = NULL; const char *type = NULL; boolean_t wholedisk = B_FALSE; uint64_t ashift = 0; int err; /* * Determine what type of vdev this is, and put the full path into * 'path'. We detect whether this is a device of file afterwards by * checking the st_mode of the file. */ if (arg[0] == '/') { /* * Complete device or file path. Exact type is determined by * examining the file descriptor afterwards. Symbolic links * are resolved to their real paths to determine whole disk * and S_ISBLK/S_ISREG type checks. However, we are careful * to store the given path as ZPOOL_CONFIG_PATH to ensure we * can leverage udev's persistent device labels. */ if (realpath(arg, path) == NULL) { (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), arg); return (NULL); } wholedisk = zfs_dev_is_whole_disk(path); if (!wholedisk && (stat64(path, &statbuf) != 0)) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); } else if (zpool_is_draid_spare(arg)) { if (!is_primary) { (void) fprintf(stderr, gettext("cannot open '%s': dRAID spares can only " "be used to replace primary vdevs\n"), arg); return (NULL); } wholedisk = B_TRUE; strlcpy(path, arg, sizeof (path)); type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); if (err != 0) { /* * If we got ENOENT, then the user gave us * gibberish, so try to direct them with a * reasonable error message. Otherwise, * regurgitate strerror() since it's the best we * can do. */ if (err == ENOENT) { (void) fprintf(stderr, gettext("cannot open '%s': no such " "device in %s\n"), arg, DISK_ROOT); (void) fprintf(stderr, gettext("must be a full path or " "shorthand device name\n")); return (NULL); } else { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } } } if (type == NULL) { /* * Determine whether this is a device or a file. */ if (wholedisk || S_ISBLK(statbuf.st_mode)) { type = VDEV_TYPE_DISK; } else if (S_ISREG(statbuf.st_mode)) { type = VDEV_TYPE_FILE; } else { fprintf(stderr, gettext("cannot use '%s': must " "be a block device or regular file\n"), path); return (NULL); } } /* * Finally, we have the complete device or file, and we know that it is * acceptable to use. Construct the nvlist to describe this vdev. All * vdevs have a 'path' element, and devices also have a 'devid' element. */ verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); /* * Override defaults if custom properties are provided. */ if (props != NULL) { const char *value = NULL; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { (void) fprintf(stderr, gettext("ashift must be a number.\n")); return (NULL); } if (ashift != 0 && (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { (void) fprintf(stderr, gettext("invalid 'ashift=%" PRIu64 "' " "property: only values between %" PRId32 " " "and %" PRId32 " are allowed.\n"), ashift, ASHIFT_MIN, ASHIFT_MAX); return (NULL); } } } /* * If the device is known to incorrectly report its physical sector * size explicitly provide the known correct value. */ if (ashift == 0) { int sector_size; if (check_sector_size_database(path, §or_size) == B_TRUE) ashift = highbit64(sector_size) - 1; } if (ashift > 0) (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); return (vdev); } /* * Go through and verify the replication level of the pool is consistent. * Performs the following checks: * * For the new spec, verifies that devices in mirrors and raidz are the * same size. * * If the current configuration already has inconsistent replication * levels, ignore any other potential problems in the new spec. * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. * * If there is no current spec (create), make sure new spec has at least * one general purpose vdev. */ typedef struct replication_level { const char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; } replication_level_t; #define ZPOOL_FUZZ (16 * 1024 * 1024) /* * N.B. For the purposes of comparing replication levels dRAID can be * considered functionally equivalent to raidz. */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; return (B_TRUE); } return (B_FALSE); } /* * Comparison for determining if dRAID and raidz where passed in either order. */ static boolean_t is_raidz_draid(replication_level_t *a, replication_level_t *b) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && (strcmp(b->zprl_type, "raidz") == 0 || strcmp(b->zprl_type, "draid") == 0)) { return (B_TRUE); } return (B_FALSE); } /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. */ static replication_level_t * get_replication(nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; nvlist_t **child; uint_t c, children; nvlist_t *nv; const char *type; replication_level_t lastrep = {0}; replication_level_t rep; replication_level_t *ret; replication_level_t *raidz, *mirror; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; nv = top[t]; /* * For separate logs we ignore the top level vdev replication * constraints. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; /* * Ignore holes introduced by removing aux devices, along * with indirect vdevs introduced by previously removed * vdevs. */ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { int64_t vdev_size; /* * This is a mirror or RAID-Z vdev. Go through and make * sure the contents are all the same (files vs. disks), * keeping track of the number of elements in the * process. * * We also check that the size of each vdev (if it can * be determined) is the same. */ rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } /* * The 'dontreport' variable indicates that we've * already reported an error for this spec, so don't * bother doing it again. */ type = NULL; dontreport = 0; vdev_size = -1LL; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; const char *path; struct stat64 statbuf; int64_t size = -1LL; const char *childtype; int fd, err; rep.zprl_children++; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); /* * If this is a replacing or spare vdev, then * get the real first child of the vdev: do this * in a loop because replacing and spare vdevs * can be nested. */ while (strcmp(childtype, VDEV_TYPE_REPLACING) == 0 || strcmp(childtype, VDEV_TYPE_SPARE) == 0) { nvlist_t **rchild; uint_t rchildren; verify(nvlist_lookup_nvlist_array(cnv, ZPOOL_CONFIG_CHILDREN, &rchild, &rchildren) == 0); assert(rchildren == 2); cnv = rchild[0]; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); } verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &path) == 0); /* * If we have a raidz/mirror that combines disks * with files, report it as an error. */ if (!dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s contains both " "files and devices\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } /* * According to stat(2), the value of 'st_size' * is undefined for block devices and character * devices. But there is no effective way to * determine the real size in userland. * * Instead, we'll take advantage of an * implementation detail of spec_size(). If the * device is currently open, then we (should) * return a valid size. * * If we still don't get a valid size (indicated * by a size of 0 or MAXOFFSET_T), then ignore * this device altogether. */ if ((fd = open(path, O_RDONLY)) >= 0) { err = fstat64_blk(fd, &statbuf); (void) close(fd); } else { err = stat64(path, &statbuf); } if (err != 0 || statbuf.st_size == 0 || statbuf.st_size == MAXOFFSET_T) continue; size = statbuf.st_size; /* * Also make sure that devices and * slices have a consistent size. If * they differ by a significant amount * (~16MB) then report an error. */ if (!dontreport && (vdev_size != -1LL && (llabs(size - vdev_size) > ZPOOL_FUZZ))) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "%s contains devices of " "different sizes\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } type = childtype; vdev_size = size; } } /* * At this point, we have the replication of the last toplevel * vdev in 'rep'. Compare it to 'lastrep' to see if it is * different. */ if (lastrep.zprl_type != NULL) { if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { /* * Accepted raidz and mirror when they can * handle the same number of disk failures. */ if (raidz->zprl_parity != mirror->zprl_children - 1) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: " "%s and %s vdevs with " "different redundancy, " "%llu vs. %llu (%llu-way) " "are present\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t) raidz->zprl_parity, (u_longlong_t) mirror->zprl_children - 1, (u_longlong_t) mirror->zprl_children); else return (NULL); } } else if (is_raidz_draid(&lastrep, &rep)) { /* * Accepted raidz and draid when they can * handle the same number of disk failures. */ if (lastrep.zprl_parity != rep.zprl_parity) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s and %s vdevs " "with different " "redundancy, %llu vs. " "%llu are present\n"), lastrep.zprl_type, rep.zprl_type, (u_longlong_t) lastrep.zprl_parity, (u_longlong_t) rep.zprl_parity); else return (NULL); } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %s and %s vdevs are " "present\n"), lastrep.zprl_type, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu and %llu device parity " "%s vdevs are present\n"), (u_longlong_t) lastrep.zprl_parity, (u_longlong_t)rep.zprl_parity, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_children != rep.zprl_children) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu-way and %llu-way %s " "vdevs are present\n"), (u_longlong_t) lastrep.zprl_children, (u_longlong_t) rep.zprl_children, rep.zprl_type); else return (NULL); } } lastrep = rep; } if (ret != NULL) *ret = rep; return (ret); } /* * Check the replication level of the vdev spec against the current pool. Calls * get_replication() to make sure the new spec is self-consistent. If the pool * has a consistent replication level, then we ignore any errors. Otherwise, * report any difference between the two. */ static int check_replication(nvlist_t *config, nvlist_t *newroot) { nvlist_t **child; uint_t children; replication_level_t *current = NULL, *new; replication_level_t *raidz, *mirror; int ret; /* * If we have a current pool configuration, check to see if it's * self-consistent. If not, simply return success. */ if (config != NULL) { nvlist_t *nvroot; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if ((current = get_replication(nvroot, B_FALSE)) == NULL) return (0); } /* * for spares there may be no children, and therefore no * replication level to check */ if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) || (children == 0)) { free(current); return (0); } /* * If all we have is logs then there's no replication level to check. */ if (num_logs(newroot) == children) { free(current); return (0); } /* * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ if ((new = get_replication(newroot, B_TRUE)) == NULL) { free(current); return (-1); } /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { if (is_raidz_mirror(current, new, &raidz, &mirror) || is_raidz_mirror(new, current, &raidz, &mirror)) { if (raidz->zprl_parity != mirror->zprl_children - 1) { vdev_error(gettext( "mismatched replication level: pool and " "new vdev with different redundancy, %s " "and %s vdevs, %llu vs. %llu (%llu-way)\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t)raidz->zprl_parity, (u_longlong_t)mirror->zprl_children - 1, (u_longlong_t)mirror->zprl_children); ret = -1; } } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " "and new vdev is %s\n"), current->zprl_type, new->zprl_type); ret = -1; } else if (current->zprl_parity != new->zprl_parity) { vdev_error(gettext( "mismatched replication level: pool uses %llu " "device parity and new vdev uses %llu\n"), (u_longlong_t)current->zprl_parity, (u_longlong_t)new->zprl_parity); ret = -1; } else if (current->zprl_children != new->zprl_children) { vdev_error(gettext( "mismatched replication level: pool uses %llu-way " "%s and new vdev uses %llu-way %s\n"), (u_longlong_t)current->zprl_children, current->zprl_type, (u_longlong_t)new->zprl_children, new->zprl_type); ret = -1; } } free(new); if (current != NULL) free(current); return (ret); } static int zero_label(const char *path) { const int size = 4096; char buf[size]; int err, fd; if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (-1); } memset(buf, 0, size); err = write(fd, buf, size); (void) fdatasync(fd); (void) close(fd); if (err == -1) { (void) fprintf(stderr, gettext("cannot zero first %d bytes " "of '%s': %s\n"), size, path, strerror(errno)); return (-1); } if (err != size) { (void) fprintf(stderr, gettext("could only zero %d/%d bytes " "of '%s'\n"), err, size, path); return (-1); } return (0); } +static void +lines_to_stderr(char *lines[], int lines_cnt) +{ + int i; + for (i = 0; i < lines_cnt; i++) { + fprintf(stderr, "%s\n", lines[i]); + } +} + /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this * device in order to provide a devid. Now that we have labelled the disk and * know that slice 0 is valid, we can construct the devid now. * * If the disk was already labeled with an EFI label, we will have gotten the * devid already (because we were able to open the whole disk). Otherwise, we * need to get the devid after we label the disk. */ static int -make_disks(zpool_handle_t *zhp, nvlist_t *nv) +make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) { nvlist_t **child; uint_t c, children; const char *type, *path; char devpath[MAXPATHLEN]; char udevpath[MAXPATHLEN]; uint64_t wholedisk; struct stat64 statbuf; int is_exclusive = 0; int fd; int ret; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (strcmp(type, VDEV_TYPE_DISK) != 0) return (0); /* * We have a disk device. If this is a whole disk write * out the efi partition table, otherwise write zero's to * the first 4k of the partition. This is to ensure that * libblkid will not misidentify the partition due to a * magic value left by the previous filesystem. */ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); if (!wholedisk) { /* * Update device id string for mpath nodes (Linux only) */ if (is_mpath_whole_disk(path)) update_vdev_config_dev_strs(nv); if (!is_spare(NULL, path)) (void) zero_label(path); return (0); } if (realpath(path, devpath) == NULL) { ret = errno; (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), path); return (ret); } /* * Remove any previously existing symlink from a udev path to * the device before labeling the disk. This ensures that * only newly created links are used. Otherwise there is a * window between when udev deletes and recreates the link * during which access attempts will fail with ENOENT. */ strlcpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); fd = open(devpath, O_RDWR|O_EXCL); if (fd == -1) { if (errno == EBUSY) is_exclusive = 1; #ifdef __FreeBSD__ if (errno == EPERM) is_exclusive = 1; #endif } else { (void) close(fd); } /* * If the partition exists, contains a valid spare label, * and is opened exclusively there is no need to partition * it. Hot spares have already been partitioned and are * held open exclusively by the kernel as a safety measure. * * If the provided path is for a /dev/disk/ device its * symbolic link will be removed, partition table created, * and then block until udev creates the new link. */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; + char **lines = NULL; + int lines_cnt = 0; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { ret = lstat64(udevpath, &statbuf); if (ret == 0 && S_ISLNK(statbuf.st_mode)) (void) unlink(udevpath); } /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. + * + * Note that 'zhp' will be NULL when we're creating a + * pool. */ - if (zpool_label_disk(g_zfs, zhp, devnode) == -1) + if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, + nv, zhp == NULL ? "create" : + replacing ? "replace" : "add", &lines, + &lines_cnt) != 0) { + (void) fprintf(stderr, + gettext( + "Error preparing/labeling disk.\n")); + if (lines_cnt > 0) { + (void) fprintf(stderr, + gettext("zfs_prepare_disk output:\n")); + lines_to_stderr(lines, lines_cnt); + } + + libzfs_free_str_array(lines, lines_cnt); return (-1); + } + libzfs_free_str_array(lines, lines_cnt); /* * Wait for udev to signal the device is available * by the provided path. */ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); if (ret) { (void) fprintf(stderr, gettext("missing link: %s was " "partitioned but %s is missing\n"), devnode, udevpath); return (ret); } ret = zero_label(udevpath); if (ret) return (ret); } /* * Update the path to refer to the partition. The presence of * the 'whole_disk' field indicates to the CLI that we should * chop off the partition number when displaying the device in * future output. */ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); /* * Update device id strings for whole disks (Linux only) */ update_vdev_config_dev_strs(nv); return (0); } for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = make_disks(zhp, child[c])) != 0) + if ((ret = make_disks(zhp, child[c], replacing)) != 0) return (ret); return (0); } /* * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. */ static boolean_t is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, boolean_t replacing, boolean_t isspare) { nvlist_t **child; uint_t c, children; const char *type, *path; int ret = 0; char buf[MAXPATHLEN]; uint64_t wholedisk = B_FALSE; boolean_t anyinuse = B_FALSE; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); /* * As a generic check, we look to see if this is a replace of a * hot spare within the same pool. If so, we allow it * regardless of what libblkid or zpool_in_use() says. */ if (replacing) { (void) strlcpy(buf, path, sizeof (buf)); if (wholedisk) { ret = zfs_append_partition(buf, sizeof (buf)); if (ret == -1) return (-1); } if (is_spare(config, buf)) return (B_FALSE); } if (strcmp(type, VDEV_TYPE_DISK) == 0) ret = check_device(path, force, isspare, wholedisk); else if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); return (ret != 0); } for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_TRUE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; return (anyinuse); } /* * Returns the parity level extracted from a raidz or draid type. * If the parity cannot be determined zero is returned. */ static int get_parity(const char *type) { long parity = 0; const char *p; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, no suffixes allowed */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || *end != '\0' || parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { return (0); } } } else if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { p = type + strlen(VDEV_TYPE_DRAID); if (*p == '\0' || *p == ':') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, allowed suffixes: '\0' or ':' */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || parity < 1 || parity > VDEV_DRAID_MAXPARITY || (*end != '\0' && *end != ':')) { return (0); } } } return ((int)parity); } /* * Assign the minimum and maximum number of devices allowed for * the specified type. On error NULL is returned, otherwise the * type prefix is returned (raidz, mirror, etc). */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { int nparity; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { nparity = get_parity(type); if (nparity == 0) return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { return (VDEV_TYPE_RAIDZ); } else { return (VDEV_TYPE_DRAID); } } if (maxdev != NULL) *maxdev = INT_MAX; if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; return (VDEV_TYPE_MIRROR); } if (strcmp(type, "spare") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_SPARE); } if (strcmp(type, "log") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_LOG); } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { if (mindev != NULL) *mindev = 1; return (type); } if (strcmp(type, "cache") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_L2CACHE); } return (NULL); } /* * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. The expected format is: * * draid[][:][:][:] * * The intent is to be able to generate a good configuration when no * additional information is provided. The only mandatory component * of the 'type' is the 'draid' prefix. If a value is not provided * then reasonable defaults are used. The optional components may * appear in any order but the d/s/c suffix is required. * * Valid inputs: * - data: number of data devices per group (1-255) * - parity: number of parity blocks per group (1-3) * - spares: number of distributed spare (0-100) * - children: total number of devices (1-255) * * Examples: * - zpool create tank draid * - zpool create tank draid2:8d:51c:2s */ static int draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) { uint64_t nparity = 1; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; long value; if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) return (EINVAL); nparity = (uint64_t)get_parity(type); if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { fprintf(stderr, gettext("invalid dRAID parity level %llu; must be " "between 1 and %d\n"), (u_longlong_t)nparity, VDEV_DRAID_MAXPARITY); return (EINVAL); } char *p = (char *)type; while ((p = strchr(p, ':')) != NULL) { char *end; p = p + 1; errno = 0; if (!isdigit(p[0])) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:] not '%s'\n"), type); return (EINVAL); } /* Expected non-zero value with c/d/s suffix */ value = strtol(p, &end, 10); char suffix = tolower(*end); if (errno != 0 || (suffix != 'c' && suffix != 'd' && suffix != 's')) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:] not '%s'\n"), type); return (EINVAL); } if (suffix == 'c') { if ((uint64_t)value != children) { fprintf(stderr, gettext("invalid number of dRAID children; " "%llu required but %llu provided\n"), (u_longlong_t)value, (u_longlong_t)children); return (EINVAL); } } else if (suffix == 'd') { ndata = (uint64_t)value; } else if (suffix == 's') { nspares = (uint64_t)value; } else { verify(0); /* Unreachable */ } } /* * When a specific number of data disks is not provided limit a * redundancy group to 8 data disks. This value was selected to * provide a reasonable tradeoff between capacity and performance. */ if (ndata == UINT64_MAX) { if (children > nspares + nparity) { ndata = MIN(children - nspares - nparity, 8); } else { fprintf(stderr, gettext("request number of " "distributed spares %llu and parity level %llu\n" "leaves no disks available for data\n"), (u_longlong_t)nspares, (u_longlong_t)nparity); return (EINVAL); } } /* Verify the maximum allowed group size is never exceeded. */ if (ndata == 0 || (ndata + nparity > children - nspares)) { fprintf(stderr, gettext("requested number of dRAID data " "disks per group %llu is too high,\nat most %llu disks " "are available for data\n"), (u_longlong_t)ndata, (u_longlong_t)(children - nspares - nparity)); return (EINVAL); } /* * Verify the requested number of spares can be satisfied. * An arbitrary limit of 100 distributed spares is applied. */ if (nspares > 100 || nspares > (children - (ndata + nparity))) { fprintf(stderr, gettext("invalid number of dRAID spares %llu; additional " "disks would be required\n"), (u_longlong_t)nspares); return (EINVAL); } /* Verify the requested number children is sufficient. */ if (children < (ndata + nparity + nspares)) { fprintf(stderr, gettext("%llu disks were provided, but at " "least %llu disks are required for this config\n"), (u_longlong_t)children, (u_longlong_t)(ndata + nparity + nspares)); } if (children > VDEV_DRAID_MAX_CHILDREN) { fprintf(stderr, gettext("%llu disks were provided, but " "dRAID only supports up to %u disks"), (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); } /* * Calculate the minimum number of groups required to fill a slice. * This is the LCM of the stripe width (ndata + nparity) and the * number of data drives (children - nspares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); return (0); } /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths * because the program is just going to exit anyway. */ static nvlist_t * construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type, *fulltype; boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; toplevels = 0; spares = NULL; l2cache = NULL; nspares = 0; nlogs = 0; nl2cache = 0; is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { fulltype = argv[0]; nv = NULL; /* * If it's a mirror, raidz, or draid the subsequent arguments * are its leaves -- until we encounter the next mirror, * raidz or draid. */ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'spare' can be " "specified only once\n")); goto spec_out; } is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } if (strcmp(type, VDEV_TYPE_LOG) == 0) { if (seen_logs) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'log' can be " "specified only once\n")); goto spec_out; } seen_logs = B_TRUE; is_log = B_TRUE; is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* * A log is not a real grouping device. * We just set is_log and continue. */ continue; } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { if (l2cache != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'cache' can be " "specified only once\n")); goto spec_out; } is_log = is_special = B_FALSE; is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: unsupported '%s' " "device: %s\n"), is_log ? "log" : "special", type); goto spec_out; } nlogs++; } for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], !(is_log || is_special || is_dedup || is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); goto spec_out; } child[children - 1] = nv; } if (children < mindev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s requires at least %d " "devices\n"), argv[0], mindev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } if (children > maxdev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s supports no more than " "%d devices\n"), argv[0], maxdev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } argc -= c; argv += c; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { spares = child; nspares = children; continue; } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { l2cache = child; nl2cache = children; continue; } else { /* create a top-level vdev with children */ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, fulltype, children) != 0) { for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, children) == 0); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); } } else { /* * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ if ((nv = make_leaf_vdev(props, argv[0], !(is_log || is_special || is_dedup || is_spare))) == NULL) goto spec_out; verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } argc--; argv++; } toplevels++; top = realloc(top, toplevels * sizeof (nvlist_t *)); if (top == NULL) zpool_no_memory(); top[toplevels - 1] = nv; } if (toplevels == 0 && nspares == 0 && nl2cache == 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto spec_out; } if (seen_logs && nlogs == 0) { (void) fprintf(stderr, gettext("invalid vdev specification: " "log requires at least 1 device\n")); goto spec_out; } /* * Finally, create nvroot and add all top-level vdevs to it. */ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)top, toplevels) == 0); if (nspares != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t **)spares, nspares) == 0); if (nl2cache != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t **)l2cache, nl2cache) == 0); spec_out: for (t = 0; t < toplevels; t++) nvlist_free(top[t]); for (t = 0; t < nspares; t++) nvlist_free(spares[t]); for (t = 0; t < nl2cache; t++) nvlist_free(l2cache[t]); free(spares); free(l2cache); free(top); return (nvroot); } nvlist_t * split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv) { nvlist_t *newroot = NULL, **child; uint_t c, children; if (argc > 0) { if ((newroot = construct_spec(props, argc, argv)) == NULL) { (void) fprintf(stderr, gettext("Unable to build a " "pool from the specified devices\n")); return (NULL); } - if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { nvlist_free(newroot); return (NULL); } /* avoid any tricks in the spec */ verify(nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); for (c = 0; c < children; c++) { const char *path; const char *type; int min, max; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_PATH, &path) == 0); if ((type = is_grouping(path, &min, &max)) != NULL) { (void) fprintf(stderr, gettext("Cannot use " "'%s' as a device for splitting\n"), type); nvlist_free(newroot); return (NULL); } } } if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } static int num_normal_vdevs(nvlist_t *nvroot) { nvlist_t **top; uint_t t, toplevels, normal = 0; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t log = B_FALSE; (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); if (log) continue; if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; normal++; } return (normal); } /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that * they are not currently in use by any other known consumer. The 'poolconfig' * parameter is the current configuration of the pool when adding devices * existing pool, and is used to perform additional checks, such as changing the * replication level of the pool. It can be 'NULL' to indicate that this is a * new pool. The 'force' flag controls whether devices should be forcefully * added, even if they appear in use. */ nvlist_t * make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv) { nvlist_t *newroot; nvlist_t *poolconfig = NULL; is_force = force; /* * Construct the vdev specification. If this is successful, we know * that we have a valid specification, and that all devices can be * opened. */ if ((newroot = construct_spec(props, argc, argv)) == NULL) return (NULL); if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { nvlist_free(newroot); return (NULL); } /* * Validate each device to make sure that it's not shared with another * subsystem. We do this even if 'force' is set, because there are some * uses (such as a dedicated dump device) that even '-f' cannot * override. */ if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { nvlist_free(newroot); return (NULL); } /* * Check the replication level of the given vdevs and report any errors * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. */ if (check_rep && check_replication(poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } /* * On pool create the new vdev spec must have one normal vdev. */ if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { vdev_error(gettext("at least one general top-level vdev must " "be specified\n")); nvlist_free(newroot); return (NULL); } /* * Run through the vdev specification and label any whole disks found. */ - if (!dryrun && make_disks(zhp, newroot) != 0) { + if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } diff --git a/sys/contrib/openzfs/config/Rules.am b/sys/contrib/openzfs/config/Rules.am index abb4ced33233..7c266964f3f3 100644 --- a/sys/contrib/openzfs/config/Rules.am +++ b/sys/contrib/openzfs/config/Rules.am @@ -1,85 +1,86 @@ # # Default build rules for all user space components, every Makefile.am # should include these rules and override or extend them as needed. # PHONY = AM_CPPFLAGS = \ -include $(top_builddir)/zfs_config.h \ -I$(top_builddir)/include \ -I$(top_srcdir)/include \ -I$(top_srcdir)/module/icp/include \ -I$(top_srcdir)/lib/libspl/include \ -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ AM_LIBTOOLFLAGS = --silent AM_CFLAGS = -std=gnu99 -Wall -Wextra -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Wno-sign-compare -Wno-missing-field-initializers AM_CFLAGS += -fno-strict-aliasing AM_CFLAGS += $(NO_OMIT_FRAME_POINTER) AM_CFLAGS += $(IMPLICIT_FALLTHROUGH) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) AM_CFLAGS += $(UBSAN_CFLAGS) AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) if BUILD_FREEBSD AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h AM_CFLAGS += -I/usr/include -I/usr/local/include endif AM_CPPFLAGS += -D_GNU_SOURCE AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" +AM_CPPFLAGS += -DZFSEXECDIR=\"$(zfsexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" AM_CPPFLAGS_NOCHECK = -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use strtok_r(3) instead!\")))" AM_CPPFLAGS_NOCHECK += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"basename(...)=basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"dirname(...)=dirname(__VA_ARGS__) __attribute__((deprecated(\"dirname(3) is underspecified. Use zfs_dirnamelen() instead!\")))" AM_CPPFLAGS_NOCHECK += -D"bcopy(...)=__attribute__((deprecated(\"bcopy(3) is deprecated. Use memcpy(3)/memmove(3) instead!\"))) bcopy(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"bcmp(...)=__attribute__((deprecated(\"bcmp(3) is deprecated. Use memcmp(3) instead!\"))) bcmp(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"bzero(...)=__attribute__((deprecated(\"bzero(3) is deprecated. Use memset(3) instead!\"))) bzero(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"asctime(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"asctime_r(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime_r(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"gmtime(...)=__attribute__((deprecated(\"gmtime(3) isn't thread-safe. Use gmtime_r(3) instead!\"))) gmtime(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"localtime(...)=__attribute__((deprecated(\"localtime(3) isn't thread-safe. Use localtime_r(3) instead!\"))) localtime(__VA_ARGS__)" AM_CPPFLAGS_NOCHECK += -D"strncpy(...)=__attribute__((deprecated(\"strncpy(3) is deprecated. Use strlcpy(3) instead!\"))) strncpy(__VA_ARGS__)" AM_CPPFLAGS += $(AM_CPPFLAGS_NOCHECK) if ASAN_ENABLED AM_CPPFLAGS += -DZFS_ASAN_ENABLED endif if UBSAN_ENABLED AM_CPPFLAGS += -DZFS_UBSAN_ENABLED endif AM_LDFLAGS = $(DEBUG_LDFLAGS) AM_LDFLAGS += $(ASAN_LDFLAGS) AM_LDFLAGS += $(UBSAN_LDFLAGS) if BUILD_FREEBSD AM_LDFLAGS += -fstack-protector-strong -shared AM_LDFLAGS += -Wl,-x -Wl,--fatal-warnings -Wl,--warn-shared-textrel AM_LDFLAGS += -lm endif # If a target includes kernel code, generate warnings for large stack frames KERNEL_CFLAGS = $(FRAME_LARGER_THAN) # See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 LIBRARY_CFLAGS = -no-suppress # Forcibly enable asserts/debugging for libzpool &al. FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG diff --git a/sys/contrib/openzfs/config/kernel-fsync-bdev.m4 b/sys/contrib/openzfs/config/kernel-fsync-bdev.m4 new file mode 100644 index 000000000000..c47e236f705f --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-fsync-bdev.m4 @@ -0,0 +1,36 @@ +dnl # +dnl # 6.6 API change, +dnl # fsync_bdev was removed in favor of sync_blockdev +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYNC_BDEV], [ + ZFS_LINUX_TEST_SRC([fsync_bdev], [ + #include + ],[ + fsync_bdev(NULL); + ]) + + ZFS_LINUX_TEST_SRC([sync_blockdev], [ + #include + ],[ + sync_blockdev(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYNC_BDEV], [ + AC_MSG_CHECKING([whether fsync_bdev() exists]) + ZFS_LINUX_TEST_RESULT([fsync_bdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FSYNC_BDEV, 1, + [fsync_bdev() is declared in include/blkdev.h]) + ],[ + AC_MSG_CHECKING([whether sync_blockdev() exists]) + ZFS_LINUX_TEST_RESULT([sync_blockdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SYNC_BLOCKDEV, 1, + [sync_blockdev() is declared in include/blkdev.h]) + ],[ + ZFS_LINUX_TEST_ERROR( + [neither fsync_bdev() nor sync_blockdev() exist]) + ]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-generic_fillattr.m4 b/sys/contrib/openzfs/config/kernel-generic_fillattr.m4 index 02dee4d4c000..f5323f0dcb9f 100644 --- a/sys/contrib/openzfs/config/kernel-generic_fillattr.m4 +++ b/sys/contrib/openzfs/config/kernel-generic_fillattr.m4 @@ -1,47 +1,68 @@ dnl # dnl # 5.12 API dnl # dnl # generic_fillattr in linux/fs.h now requires a struct user_namespace* dnl # as the first arg, to support idmapped mounts. dnl # dnl # 6.3 API dnl # generic_fillattr() now takes struct mnt_idmap* as the first argument dnl # +dnl # 6.6 API +dnl # generic_fillattr() now takes u32 as second argument, representing a +dnl # request_mask for statx +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ #include ],[ struct user_namespace *userns = NULL; struct inode *in = NULL; struct kstat *k = NULL; generic_fillattr(userns, in, k); ]) ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap], [ #include ],[ struct mnt_idmap *idmap = NULL; struct inode *in = NULL; struct kstat *k = NULL; generic_fillattr(idmap, in, k); ]) + + ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap_reqmask], [ + #include + ],[ + struct mnt_idmap *idmap = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(idmap, 0, in, k); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR], [ - AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ + AC_MSG_CHECKING( + [whether generic_fillattr requires struct mnt_idmap* and request_mask]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap_reqmask], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, - [generic_fillattr requires struct mnt_idmap*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK, 1, + [generic_fillattr requires struct mnt_idmap* and u32 request_mask]) ],[ - AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, - [generic_fillattr requires struct user_namespace*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, + [generic_fillattr requires struct mnt_idmap*]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) ]) ]) ]) diff --git a/sys/contrib/openzfs/config/kernel-inode-times.m4 b/sys/contrib/openzfs/config/kernel-inode-times.m4 index 9c016c790081..412e13b47df5 100644 --- a/sys/contrib/openzfs/config/kernel-inode-times.m4 +++ b/sys/contrib/openzfs/config/kernel-inode-times.m4 @@ -1,50 +1,93 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ dnl # dnl # 5.6 API change dnl # timespec64_trunc() replaced by timestamp_truncate() interface. dnl # ZFS_LINUX_TEST_SRC([timestamp_truncate], [ #include ],[ struct timespec64 ts; struct inode ip; memset(&ts, 0, sizeof(ts)); ts = timestamp_truncate(ts, &ip); ]) dnl # dnl # 4.18 API change dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64. dnl # ZFS_LINUX_TEST_SRC([inode_times], [ #include ],[ struct inode ip; struct timespec ts; memset(&ip, 0, sizeof(ip)); ts = ip.i_mtime; ]) + + dnl # + dnl # 6.6 API change + dnl # i_ctime no longer directly accessible, must use + dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to + dnl # read/write. + dnl # + ZFS_LINUX_TEST_SRC([inode_get_ctime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_ctime(&ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts; + + memset(&ip, 0, sizeof(ip)); + inode_set_ctime_to_ts(&ip, ts); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ AC_MSG_CHECKING([whether timestamp_truncate() exists]) ZFS_LINUX_TEST_RESULT([timestamp_truncate], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_TIMESTAMP_TRUNCATE, 1, [timestamp_truncate() exists]) ],[ AC_MSG_RESULT(no) ]) AC_MSG_CHECKING([whether inode->i_*time's are timespec64]) ZFS_LINUX_TEST_RESULT([inode_times], [ AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1, [inode->i_*time's are timespec64]) ]) + + AC_MSG_CHECKING([whether inode_get_ctime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_ctime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_CTIME, 1, + [inode_get_ctime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1, + [inode_set_ctime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index df194ec72207..056517a841f2 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -1,1028 +1,1030 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_CONFIG_DEFINED ZFS_AC_MODULE_SYMVERS dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE ZFS_AC_KERNEL_SRC_FADVISE ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_KMEM_CACHE ZFS_AC_KERNEL_SRC_KVMALLOC ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_WAIT ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_INODE_LOCK ZFS_AC_KERNEL_SRC_GROUP_INFO_GID ZFS_AC_KERNEL_SRC_RW ZFS_AC_KERNEL_SRC_TIMER_SETUP ZFS_AC_KERNEL_SRC_SUPER_USER_NS ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE ZFS_AC_KERNEL_SRC_DIRTY_INODE ZFS_AC_KERNEL_SRC_SHRINKER ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_PERMISSION ZFS_AC_KERNEL_SRC_GET_LINK ZFS_AC_KERNEL_SRC_PUT_LINK ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_CLEAR_INODE ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_BDI ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_CURRENT_TIME ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_SRC_KTIME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_CPU_HOTPLUG ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS ZFS_AC_KERNEL_SRC_SIGNAL_STOP ZFS_AC_KERNEL_SRC_SIGINFO ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ + ZFS_AC_KERNEL_SRC_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; esac AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_FADVISE ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_KMEM_CACHE ZFS_AC_KERNEL_KVMALLOC ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_WAIT ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_INODE_LOCK ZFS_AC_KERNEL_GROUP_INFO_GID ZFS_AC_KERNEL_RW ZFS_AC_KERNEL_TIMER_SETUP ZFS_AC_KERNEL_SUPER_USER_NS ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE ZFS_AC_KERNEL_DIRTY_INODE ZFS_AC_KERNEL_SHRINKER ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_PERMISSION ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_CLEAR_INODE ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_BDI ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_CURRENT_TIME ZFS_AC_KERNEL_USERNS_CAPABILITIES ZFS_AC_KERNEL_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_KTIME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_CPU_HOTPLUG ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS ZFS_AC_KERNEL_SIGNAL_STOP ZFS_AC_KERNEL_SIGINFO ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_SET_SPECIAL_STATE ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ + ZFS_AC_KERNEL_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; esac ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # dnl # Most modern Linux distributions have separate locations for bare dnl # source (source) and prebuilt (build) files. Additionally, there are dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)` dnl # pointing to them. The directory search order is now: dnl # dnl # - `configure` command line values if both `--with-linux` and dnl # `--with-linux-obj` were defined dnl # dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed dnl # to have the same value as `--with-linux` dnl # dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined dnl # autodetection is used: dnl # dnl # - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist. dnl # dnl # - If only `/lib/modules/$(uname -r)/build` exists, it is assumed dnl # to be both source and build directory. dnl # dnl # - The first directory in `/lib/modules` with the highest version dnl # number according to `sort -V` which contains both `source` and dnl # `build` symlinks/directories. If module directory contains only dnl # `build` component, it is assumed to be both source and build dnl # directory. dnl # dnl # - Last resort: the first directory matching `/usr/src/kernels/*` dnl # and `/usr/src/linux-*` with the highest version number according dnl # to `sort -V` is assumed to be both source and build directory. dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source and build directories]) AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [ kernelbuild="$kernelsrc" ], [test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source" && \ test -e "/lib/modules/$(uname -r)/build"], [ src="/lib/modules/$(uname -r)/source" build="/lib/modules/$(uname -r)/build" ], [test -e "/lib/modules/$(uname -r)/build"], [ build="/lib/modules/$(uname -r)/build" src="$build" ], [ src= for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do if test -e "$d/source" && test -e "$d/build"; then src="$d/source" build="$d/build" break fi if test -e "$d/build"; then src="$d/build" build="$d/build" break fi done # the least reliable method if test -z "$src"; then src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \ 2>/dev/null | grep -v obj | sort -Vr | head -1) build="$src" fi ]) AS_IF([test -n "$src" && test -e "$src"], [ kernelsrc=$(readlink -e "$src") ], [ kernelsrc="[Not found]" ]) AS_IF([test -n "$build" && test -e "$build"], [ kernelbuild=$(readlink -e "$build") ], [ kernelbuild="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([done]) AC_MSG_CHECKING([kernel source directory]) AC_MSG_RESULT([$kernelsrc]) AC_MSG_CHECKING([kernel build directory]) AC_MSG_RESULT([$kernelbuild]) AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. If that fails, you can specify the *** location of the kernel source and build with the '--with-linux=PATH' and *** '--with-linux-obj=PATH' options respectively.]) ]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. ]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_ARG_VAR([KERNEL_CC], [C compiler for building kernel modules]) AC_ARG_VAR([KERNEL_LD], [Linker for building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped. ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [$2], [$3]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ AS_IF([ ! test -f build/$1/$1.ko], [ $5 ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) ], [ $4 ]) ]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported on not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4], [$5]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) ]) dnl # dnl # AS_VERSION_COMPARE_LE dnl # like AS_VERSION_COMPARE_LE, but runs $3 if (and only if) $1 <= $2 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater]) dnl # AC_DEFUN([AS_VERSION_COMPARE_LE], [ AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_REQUIRE_API dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is dnl # at least some specified version. dnl # AC_DEFUN([ZFS_LINUX_REQUIRE_API], [ AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. This *** interface is expected for kernels version "$2" and above. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. Newer kernels may have incompatible *** APIs. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ], [ AC_MSG_RESULT(no) ]) ]) diff --git a/sys/contrib/openzfs/configure.ac b/sys/contrib/openzfs/configure.ac index 4c75616e4299..f31fe1db81e4 100644 --- a/sys/contrib/openzfs/configure.ac +++ b/sys/contrib/openzfs/configure.ac @@ -1,88 +1,89 @@ /* * This file is part of OpenZFS. * * Copyright (c) 2009 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory * Written by: * Brian Behlendorf , * Herb Wartens , * Jim Garlick * LLNL-CODE-403049 * * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ AC_INIT(m4_esyscmd(awk '/^Name:/ {printf $2}' META), m4_esyscmd(awk '/^Version:/ {printf $2}' META)) CFGOPTS="$*" AC_LANG(C) ZFS_AC_META AC_CONFIG_AUX_DIR([config]) AC_CONFIG_MACRO_DIR([config]) AC_CANONICAL_TARGET AM_MAINTAINER_MODE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AM_INIT_AUTOMAKE([subdir-objects foreign]) # Remove default macros from config.h: # PACKAGE, PACKAGE_{BUGREPORT,NAME,STRING,TARNAME,VERSION}, STDC_HEADERS, VERSION AC_CONFIG_HEADERS([zfs_config.h], [ sed -nri~ -e '/^$/be' -e 'N;N;/#define (PACKAGE|VERSION|STDC_HEADERS)/d' -e ':e' -e 'p' zfs_config.h && rm zfs_config.h~ || exit]) LT_INIT AC_PROG_INSTALL AC_PROG_CC AC_PROG_LN_S PKG_PROG_PKG_CONFIG AM_PROG_AS AM_PROG_CC_C_O AX_CODE_COVERAGE _AM_PROG_TAR(pax) ZFS_AC_LICENSE ZFS_AC_CONFIG ZFS_AC_PACKAGE ZFS_AC_DEBUG ZFS_AC_DEBUGINFO ZFS_AC_DEBUG_KMEM ZFS_AC_DEBUG_KMEM_TRACKING ZFS_AC_DEBUG_INVARIANTS AC_CONFIG_FILES([ contrib/debian/rules + contrib/debian/changelog Makefile include/Makefile lib/libzfs/libzfs.pc lib/libzfs_core/libzfs_core.pc lib/libzfsbootenv/libzfsbootenv.pc module/Kbuild module/Makefile rpm/generic/zfs-dkms.spec rpm/generic/zfs-kmod.spec rpm/generic/zfs.spec rpm/redhat/zfs-dkms.spec rpm/redhat/zfs-kmod.spec rpm/redhat/zfs.spec tests/zfs-tests/tests/Makefile zfs.release ]) AC_OUTPUT diff --git a/sys/contrib/openzfs/contrib/debian/changelog b/sys/contrib/openzfs/contrib/debian/changelog.in similarity index 74% rename from sys/contrib/openzfs/contrib/debian/changelog rename to sys/contrib/openzfs/contrib/debian/changelog.in index ba42ea59fa8d..525519a73d08 100644 --- a/sys/contrib/openzfs/contrib/debian/changelog +++ b/sys/contrib/openzfs/contrib/debian/changelog.in @@ -1,13 +1,19 @@ +openzfs-linux (@VERSION@-1) unstable; urgency=low + + * OpenZFS @VERSION@ is tagged. + + -- Umer Saleem Wed, 15 Nov 2023 15:00:00 +0500 + openzfs-linux (2.2.0-0) unstable; urgency=low * OpenZFS 2.2.0 is tagged. -- Umer Saleem Tue, 25 Jul 2023 15:00:00 +0500 openzfs-linux (2.1.99-1) unstable; urgency=low * Integrate minimally modified Debian packaging from ZFS on Linux (source: https://salsa.debian.org/zfsonlinux-team/zfs) * This packaging is a fork of Debian zfs-linux 2.1.6-2 release. -- Umer Saleem Fri, 11 Oct 2022 15:00:00 -0400 diff --git a/sys/contrib/openzfs/contrib/debian/control b/sys/contrib/openzfs/contrib/debian/control index f4e97fe16145..98beb900d0fa 100644 --- a/sys/contrib/openzfs/contrib/debian/control +++ b/sys/contrib/openzfs/contrib/debian/control @@ -1,327 +1,326 @@ Source: openzfs-linux Section: contrib/kernel Priority: optional Maintainer: ZFS on Linux specific mailing list Build-Depends: debhelper-compat (= 12), dh-python, dh-sequence-dkms | dkms (>> 2.1.1.2-5), libaio-dev, libblkid-dev, libcurl4-openssl-dev, libelf-dev, libpam0g-dev, libssl-dev | libssl1.0-dev, libtool, libudev-dev, lsb-release, po-debconf, python3-all-dev, python3-cffi, python3-setuptools, python3-sphinx, uuid-dev, zlib1g-dev Standards-Version: 4.5.1 Homepage: https://openzfs.org/ Vcs-Git: https://github.com/openzfs/zfs.git Vcs-Browser: https://github.com/openzfs/zfs Rules-Requires-Root: no XS-Autobuild: yes Package: openzfs-libnvpair3 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libnvpair1, libnvpair3 Replaces: libnvpair1, libnvpair3, libnvpair3linux Conflicts: libnvpair3linux Description: Solaris name-value library for Linux This library provides routines for packing and unpacking nv pairs for transporting data across process boundaries, transporting between kernel and userland, and possibly saving onto disk files. Package: openzfs-libpam-zfs Section: contrib/admin Architecture: linux-any Depends: libpam-runtime, ${misc:Depends}, ${shlibs:Depends} Replaces: libpam-zfs Conflicts: libpam-zfs Description: PAM module for managing encryption keys for ZFS OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This provides a Pluggable Authentication Module (PAM) that automatically unlocks encrypted ZFS datasets upon login. Package: openzfs-libuutil3 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libuutil1, libuutil3 Replaces: libuutil1, libuutil3, libuutil3linux Conflicts: libuutil3linux Description: Solaris userland utility library for Linux This library provides a variety of glue functions for ZFS on Linux: * libspl: The Solaris Porting Layer userland library, which provides APIs that make it possible to run Solaris user code in a Linux environment with relatively minimal modification. * libavl: The Adelson-Velskii Landis balanced binary tree manipulation library. * libefi: The Extensible Firmware Interface library for GUID disk partitioning. * libshare: NFS, SMB, and iSCSI service integration for ZFS. Package: openzfs-libzfs-dev Section: contrib/libdevel Architecture: linux-any Depends: libssl-dev | libssl1.0-dev, openzfs-libnvpair3 (= ${binary:Version}), openzfs-libuutil3 (= ${binary:Version}), openzfs-libzfs4 (= ${binary:Version}), openzfs-libzfsbootenv1 (= ${binary:Version}), openzfs-libzpool5 (= ${binary:Version}), ${misc:Depends} Replaces: libzfslinux-dev Conflicts: libzfslinux-dev Provides: libnvpair-dev, libuutil-dev Description: OpenZFS filesystem development files for Linux Header files and static libraries for compiling software against libraries of OpenZFS filesystem. . This package includes the development files of libnvpair3, libuutil3, libzpool5 and libzfs4. Package: openzfs-libzfs4 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} # The libcurl4 is loaded through dlopen("libcurl.so.4"). # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521 Recommends: libcurl4 Breaks: libzfs2, libzfs4 Replaces: libzfs2, libzfs4, libzfs4linux Conflicts: libzfs4linux Description: OpenZFS filesystem library for Linux - general support OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . The OpenZFS library provides support for managing OpenZFS filesystems. Package: openzfs-libzfsbootenv1 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libzfs2, libzfs4 Replaces: libzfs2, libzfs4, libzfsbootenv1linux Conflicts: libzfsbootenv1linux Description: OpenZFS filesystem library for Linux - label info support OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . The zfsbootenv library provides support for modifying ZFS label information. Package: openzfs-libzpool5 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} Breaks: libzpool2, libzpool5 Replaces: libzpool2, libzpool5, libzpool5linux Conflicts: libzpool5linux Description: OpenZFS pool library for Linux OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This zpool library provides support for managing zpools. Package: openzfs-python3-pyzfs Section: contrib/python Architecture: linux-any Depends: python3-cffi, openzfs-zfsutils (= ${binary:Version}), ${misc:Depends}, ${python3:Depends} Replaces: python3-pyzfs Conflicts: python3-pyzfs Description: wrapper for libzfs_core C library libzfs_core is intended to be a stable interface for programmatic administration of ZFS. This wrapper provides one-to-one wrappers for libzfs_core API functions, but the signatures and types are more natural to Python. . nvlists are wrapped as dictionaries or lists depending on their usage. Some parameters have default values depending on typical use for increased convenience. Enumerations and bit flags become strings and lists of strings in Python. Errors are reported as exceptions rather than integer errno-style error codes. The wrapper takes care to provide one-to-many mapping of the error codes to the exceptions by interpreting a context in which the error code is produced. Package: openzfs-pyzfs-doc Section: contrib/doc Architecture: all Depends: ${misc:Depends}, ${sphinxdoc:Depends} Recommends: openzfs-python3-pyzfs Replaces: pyzfs-doc Conflicts: pyzfs-doc Description: wrapper for libzfs_core C library (documentation) libzfs_core is intended to be a stable interface for programmatic administration of ZFS. This wrapper provides one-to-one wrappers for libzfs_core API functions, but the signatures and types are more natural to Python. . nvlists are wrapped as dictionaries or lists depending on their usage. Some parameters have default values depending on typical use for increased convenience. Enumerations and bit flags become strings and lists of strings in Python. Errors are reported as exceptions rather than integer errno-style error codes. The wrapper takes care to provide one-to-many mapping of the error codes to the exceptions by interpreting a context in which the error code is produced. . This package contains the documentation. Package: openzfs-zfs-dkms Architecture: all Depends: dkms (>> 2.1.1.2-5), file, libc6-dev | libc-dev, lsb-release, python3-distutils | libpython3-stdlib (<< 3.6.4), ${misc:Depends}, ${perl:Depends} Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends} # suggests debhelper because e.g. `dkms mkdeb -m zfs -v 0.8.2` needs dh_testdir (#909183) Suggests: debhelper Breaks: spl-dkms (<< 0.8.0~rc1) Replaces: spl-dkms, zfs-dkms -Conflicts: zfs-dkms Provides: openzfs-zfs-modules Description: OpenZFS filesystem kernel modules for Linux OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This DKMS package includes the SPA, DMU, ZVOL, and ZPL components of OpenZFS. Package: openzfs-zfs-initramfs Architecture: all Depends: busybox-initramfs | busybox-static | busybox, initramfs-tools, openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfsutils (>= ${source:Version}), ${misc:Depends} Breaks: zfsutils-linux (<= 0.7.11-2) Replaces: zfsutils-linux (<= 0.7.11-2), zfs-initramfs Conflicts: zfs-initramfs Description: OpenZFS root filesystem capabilities for Linux - initramfs OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This package adds OpenZFS to the system initramfs with a hook for the initramfs-tools infrastructure. Package: openzfs-zfs-dracut Architecture: all Depends: dracut, openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfsutils (>= ${source:Version}), ${misc:Depends} Conflicts: zfs-dracut Replaces: zfs-dracut Description: OpenZFS root filesystem capabilities for Linux - dracut OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This package adds OpenZFS to the system initramfs with a hook for the dracut infrastructure. Package: openzfs-zfsutils Section: contrib/admin Architecture: linux-any Pre-Depends: ${misc:Pre-Depends} Depends: openzfs-libnvpair3 (= ${binary:Version}), openzfs-libuutil3 (= ${binary:Version}), openzfs-libzfs4 (= ${binary:Version}), openzfs-libzpool5 (= ${binary:Version}), python3, ${misc:Depends}, ${shlibs:Depends} Recommends: lsb-base, openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfs-zed Breaks: openrc, spl (<< 0.7.9-2), spl-dkms (<< 0.8.0~rc1), openzfs-zfs-dkms (<< ${source:Version}), openzfs-zfs-dkms (>> ${source:Version}...) Replaces: spl (<< 0.7.9-2), spl-dkms, zfsutils-linux Conflicts: zfs, zfs-fuse, zfsutils-linux Suggests: nfs-kernel-server, samba-common-bin (>= 3.0.23), openzfs-zfs-initramfs | openzfs-zfs-dracut Provides: openzfsutils Description: command-line tools to manage OpenZFS filesystems OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This package provides the zfs and zpool commands to create and administer OpenZFS filesystems. Package: openzfs-zfs-zed Section: contrib/admin Architecture: linux-any Pre-Depends: ${misc:Pre-Depends} Depends: openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfsutils (>= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends} Conflicts: zfs, zfs-zed Replaces: zfs-zed Description: OpenZFS Event Daemon OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . ZED (ZFS Event Daemon) monitors events generated by the ZFS kernel module. When a zevent (ZFS Event) is posted, ZED will run any ZEDLETs (ZFS Event Daemon Linkage for Executable Tasks) that have been enabled for the corresponding zevent class. . This package provides the OpenZFS Event Daemon (zed). Package: openzfs-zfs-test Section: contrib/admin Architecture: linux-any Depends: acl, attr, bc, fio, ksh, lsscsi, mdadm, parted, python3, openzfs-python3-pyzfs, sudo, sysstat, openzfs-zfs-modules | openzfs-zfs-dkms, openzfs-zfsutils (>=${binary:Version}), ${misc:Depends}, ${shlibs:Depends} Recommends: nfs-kernel-server Breaks: zfsutils-linux (<= 0.7.9-2) Replaces: zfsutils-linux (<= 0.7.9-2), zfs-test Conflicts: zutils, zfs-test Description: OpenZFS test infrastructure and support scripts OpenZFS is a storage platform that encompasses the functionality of traditional filesystems and volume managers. It supports data checksums, compression, encryption, snapshots, and more. . This package provides the OpenZFS test infrastructure for destructively testing and validating a system using OpenZFS. It is entirely optional and should only be installed and used in test environments. diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install index fa05401bc168..741014398ade 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install @@ -1,134 +1,136 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ lib/systemd/system-generators/ lib/systemd/system-preset/ lib/systemd/system/zfs-import-cache.service lib/systemd/system/zfs-import-scan.service lib/systemd/system/zfs-import.target lib/systemd/system/zfs-load-key.service lib/systemd/system/zfs-mount.service lib/systemd/system/zfs-scrub-monthly@.timer lib/systemd/system/zfs-scrub-weekly@.timer lib/systemd/system/zfs-scrub@.service lib/systemd/system/zfs-trim-monthly@.timer lib/systemd/system/zfs-trim-weekly@.timer lib/systemd/system/zfs-trim@.service lib/systemd/system/zfs-share.service lib/systemd/system/zfs-volume-wait.service lib/systemd/system/zfs-volumes.target lib/systemd/system/zfs.target lib/udev/ sbin/fsck.zfs sbin/mount.zfs sbin/zdb sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump usr/bin/zvol_wait usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb +usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary usr/sbin/arcstat usr/sbin/dbufstat usr/sbin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions usr/share/man/man1/arcstat.1 usr/share/man/man1/zhack.1 usr/share/man/man1/zvol_wait.1 usr/share/man/man5/ usr/share/man/man8/fsck.zfs.8 usr/share/man/man8/mount.zfs.8 usr/share/man/man8/vdev_id.8 usr/share/man/man8/zdb.8 usr/share/man/man8/zfs-allow.8 usr/share/man/man8/zfs-bookmark.8 usr/share/man/man8/zfs-change-key.8 usr/share/man/man8/zfs-clone.8 usr/share/man/man8/zfs-create.8 usr/share/man/man8/zfs-destroy.8 usr/share/man/man8/zfs-diff.8 usr/share/man/man8/zfs-get.8 usr/share/man/man8/zfs-groupspace.8 usr/share/man/man8/zfs-hold.8 usr/share/man/man8/zfs-inherit.8 usr/share/man/man8/zfs-list.8 usr/share/man/man8/zfs-load-key.8 usr/share/man/man8/zfs-mount-generator.8 usr/share/man/man8/zfs-mount.8 usr/share/man/man8/zfs-program.8 usr/share/man/man8/zfs-project.8 usr/share/man/man8/zfs-projectspace.8 usr/share/man/man8/zfs-promote.8 usr/share/man/man8/zfs-receive.8 usr/share/man/man8/zfs-recv.8 usr/share/man/man8/zfs-redact.8 usr/share/man/man8/zfs-release.8 usr/share/man/man8/zfs-rename.8 usr/share/man/man8/zfs-rollback.8 usr/share/man/man8/zfs-send.8 usr/share/man/man8/zfs-set.8 usr/share/man/man8/zfs-share.8 usr/share/man/man8/zfs-snapshot.8 usr/share/man/man8/zfs-unallow.8 usr/share/man/man8/zfs-unload-key.8 usr/share/man/man8/zfs-unmount.8 usr/share/man/man8/zfs-unzone.8 usr/share/man/man8/zfs-upgrade.8 usr/share/man/man8/zfs-userspace.8 usr/share/man/man8/zfs-wait.8 usr/share/man/man8/zfs-zone.8 usr/share/man/man8/zfs.8 usr/share/man/man8/zfs_ids_to_path.8 +usr/share/man/man8/zfs_prepare_disk.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 usr/share/man/man8/zpool-history.8 usr/share/man/man8/zpool-import.8 usr/share/man/man8/zpool-initialize.8 usr/share/man/man8/zpool-iostat.8 usr/share/man/man8/zpool-labelclear.8 usr/share/man/man8/zpool-list.8 usr/share/man/man8/zpool-offline.8 usr/share/man/man8/zpool-online.8 usr/share/man/man8/zpool-reguid.8 usr/share/man/man8/zpool-remove.8 usr/share/man/man8/zpool-reopen.8 usr/share/man/man8/zpool-replace.8 usr/share/man/man8/zpool-resilver.8 usr/share/man/man8/zpool-scrub.8 usr/share/man/man8/zpool-set.8 usr/share/man/man8/zpool-split.8 usr/share/man/man8/zpool-status.8 usr/share/man/man8/zpool-sync.8 usr/share/man/man8/zpool-trim.8 usr/share/man/man8/zpool-upgrade.8 usr/share/man/man8/zpool-wait.8 usr/share/man/man8/zpool.8 usr/share/man/man7/vdevprops.7 usr/share/man/man7/zpoolconcepts.7 usr/share/man/man7/zpoolprops.7 usr/share/man/man8/zstream.8 usr/share/man/man8/zstreamdump.8 usr/share/man/man4/spl.4 usr/share/man/man4/zfs.4 usr/share/man/man7/zpool-features.7 usr/share/man/man8/zpool_influxdb.8 diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index 6c3669273786..4adfa38e87be 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -1,1040 +1,1049 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley */ #ifndef _LIBZFS_H #define _LIBZFS_H extern __attribute__((visibility("default"))) #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* resilvering (healing reconstruction) */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* failed to unshare over nfs */ EZFS_SHARENFSFAILED, /* failed to share over nfs */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_ERRORSCRUBBING, /* currently error scrubbing */ EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ EZFS_SCRUB_PAUSED_TO_CANCEL, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_IOC_NOTSUPPORTED, /* operation not supported by zfs module */ EZFS_TOOMANY, /* argument list too long */ EZFS_INITIALIZING, /* currently initializing */ EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_WRONG_PARENT, /* invalid parent dataset (e.g ZVOL) */ EZFS_TRIMMING, /* currently trimming */ EZFS_NO_TRIM, /* no active trim */ EZFS_TRIM_NOTSUP, /* device does not support trim */ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */ EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */ EZFS_REBUILDING, /* resilvering (sequential reconstrution) */ EZFS_VDEV_NOTSUP, /* ops not supported for this type of vdev */ EZFS_NOT_USER_NAMESPACE, /* a file is not a user namespace */ EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_UNKNOWN } zfs_error_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. */ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; _LIBZFS_H int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); _LIBZFS_H int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, boolean_t *, boolean_t *); /* * Library initialization */ _LIBZFS_H libzfs_handle_t *libzfs_init(void); _LIBZFS_H void libzfs_fini(libzfs_handle_t *); _LIBZFS_H libzfs_handle_t *zpool_get_handle(zpool_handle_t *); _LIBZFS_H libzfs_handle_t *zfs_get_handle(zfs_handle_t *); _LIBZFS_H void libzfs_print_on_error(libzfs_handle_t *, boolean_t); _LIBZFS_H void zfs_save_arguments(int argc, char **, char *, int); _LIBZFS_H int zpool_log_history(libzfs_handle_t *, const char *); _LIBZFS_H int libzfs_errno(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_init(int); _LIBZFS_H const char *libzfs_error_action(libzfs_handle_t *); _LIBZFS_H const char *libzfs_error_description(libzfs_handle_t *); _LIBZFS_H int zfs_standard_error(libzfs_handle_t *, int, const char *); _LIBZFS_H void libzfs_mnttab_init(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_fini(libzfs_handle_t *); _LIBZFS_H void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); _LIBZFS_H int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); _LIBZFS_H void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); _LIBZFS_H void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ _LIBZFS_H zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); _LIBZFS_H zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); _LIBZFS_H void zpool_close(zpool_handle_t *); _LIBZFS_H const char *zpool_get_name(zpool_handle_t *); _LIBZFS_H int zpool_get_state(zpool_handle_t *); _LIBZFS_H const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); _LIBZFS_H const char *zpool_pool_state_to_name(pool_state_t); _LIBZFS_H void zpool_free_handles(libzfs_handle_t *); /* * Iterate over all active pools in the system. */ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); _LIBZFS_H int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); _LIBZFS_H boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ unsigned int dryrun : 1; /* after splitting, import the pool */ unsigned int import : 1; int name_flags; } splitflags_t; typedef struct trimflags { /* requested vdevs are for the entire pool */ boolean_t fullpool; /* request a secure trim, requires support from device */ boolean_t secure; /* after starting trim, block until trim completes */ boolean_t wait; /* trim at the requested rate in bytes/second */ uint64_t rate; } trimflags_t; /* * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); _LIBZFS_H int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int, boolean_t); _LIBZFS_H int zpool_vdev_detach(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_remove_cancel(zpool_handle_t *); _LIBZFS_H int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); _LIBZFS_H int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); _LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); _LIBZFS_H nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); _LIBZFS_H int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); +_LIBZFS_H int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt); +_LIBZFS_H int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, + zpool_handle_t *, const char *, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt); +_LIBZFS_H char ** zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val); +_LIBZFS_H void zpool_vdev_script_free_env(char **env); _LIBZFS_H uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); _LIBZFS_H const char *zpool_get_state_str(zpool_handle_t *); /* * Functions to manage pool properties */ _LIBZFS_H int zpool_set_prop(zpool_handle_t *, const char *, const char *); _LIBZFS_H int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); _LIBZFS_H int zpool_get_userprop(zpool_handle_t *, const char *, char *, size_t proplen, zprop_source_t *); _LIBZFS_H uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); _LIBZFS_H int zpool_props_refresh(zpool_handle_t *); _LIBZFS_H const char *zpool_prop_to_name(zpool_prop_t); _LIBZFS_H const char *zpool_prop_values(zpool_prop_t); /* * Functions to manage vdev properties */ _LIBZFS_H int zpool_get_vdev_prop_value(nvlist_t *, vdev_prop_t, char *, char *, size_t, zprop_source_t *, boolean_t); _LIBZFS_H int zpool_get_vdev_prop(zpool_handle_t *, const char *, vdev_prop_t, char *, char *, size_t, zprop_source_t *, boolean_t); _LIBZFS_H int zpool_get_all_vdev_props(zpool_handle_t *, const char *, nvlist_t **); _LIBZFS_H int zpool_set_vdev_prop(zpool_handle_t *, const char *, const char *, const char *); _LIBZFS_H const char *vdev_prop_to_name(vdev_prop_t); _LIBZFS_H const char *vdev_prop_values(vdev_prop_t); _LIBZFS_H boolean_t vdev_prop_user(const char *name); _LIBZFS_H const char *vdev_prop_column_name(vdev_prop_t); _LIBZFS_H boolean_t vdev_prop_align_right(vdev_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. * This must be kept in sync with the zfs_msgid_table in * lib/libzfs/libzfs_status.c. */ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ ZPOOL_STATUS_ERRATA, /* informational errata available */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. */ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; _LIBZFS_H zpool_status_t zpool_get_status(zpool_handle_t *, const char **, zpool_errata_t *); _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, const char **, zpool_errata_t *); /* * Statistics and configuration functions. */ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ _LIBZFS_H int zpool_export(zpool_handle_t *, boolean_t, const char *); _LIBZFS_H int zpool_export_force(zpool_handle_t *, const char *); _LIBZFS_H int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); _LIBZFS_H int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); _LIBZFS_H void zpool_print_unsup_feat(nvlist_t *config); /* * Miscellaneous pool functions */ struct zfs_cmd; _LIBZFS_H const char *const zfs_history_event_names[]; typedef enum { VDEV_NAME_PATH = 1 << 0, VDEV_NAME_GUID = 1 << 1, VDEV_NAME_FOLLOW_LINKS = 1 << 2, VDEV_NAME_TYPE_ID = 1 << 3, } vdev_name_t; _LIBZFS_H char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, int name_flags); _LIBZFS_H int zpool_upgrade(zpool_handle_t *, uint64_t); _LIBZFS_H int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, boolean_t *); _LIBZFS_H int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, int); _LIBZFS_H int zpool_events_clear(libzfs_handle_t *, int *); _LIBZFS_H int zpool_events_seek(libzfs_handle_t *, uint64_t, int); _LIBZFS_H void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t); _LIBZFS_H int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); _LIBZFS_H void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); _LIBZFS_H int zpool_checkpoint(zpool_handle_t *); _LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); _LIBZFS_H boolean_t zpool_is_draid_spare(const char *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ _LIBZFS_H zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); _LIBZFS_H zfs_handle_t *zfs_handle_dup(zfs_handle_t *); _LIBZFS_H void zfs_close(zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_type(const zfs_handle_t *); _LIBZFS_H zfs_type_t zfs_get_underlying_type(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_name(const zfs_handle_t *); _LIBZFS_H zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); _LIBZFS_H const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. */ /* * zfs dataset property management */ _LIBZFS_H const char *zfs_prop_default_string(zfs_prop_t); _LIBZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _LIBZFS_H const char *zfs_prop_column_name(zfs_prop_t); _LIBZFS_H boolean_t zfs_prop_align_right(zfs_prop_t); _LIBZFS_H nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); _LIBZFS_H const char *zfs_prop_to_name(zfs_prop_t); _LIBZFS_H int zfs_prop_set(zfs_handle_t *, const char *, const char *); _LIBZFS_H int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_prop_set_list_flags(zfs_handle_t *, nvlist_t *, int); _LIBZFS_H int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); _LIBZFS_H int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); _LIBZFS_H int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); _LIBZFS_H int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); _LIBZFS_H int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); _LIBZFS_H uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, const char **); _LIBZFS_H uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); _LIBZFS_H int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); _LIBZFS_H const char *zfs_prop_values(zfs_prop_t); _LIBZFS_H int zfs_prop_is_string(zfs_prop_t prop); _LIBZFS_H nvlist_t *zfs_get_all_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_user_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_recvd_props(zfs_handle_t *); _LIBZFS_H nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); _LIBZFS_H int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, boolean_t *, boolean_t *); /* * zfs encryption management */ _LIBZFS_H int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); _LIBZFS_H int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, boolean_t stdin_available, uint8_t **, uint_t *); _LIBZFS_H int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, nvlist_t *); _LIBZFS_H int zfs_crypto_attempt_load_keys(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_crypto_load_key(zfs_handle_t *, boolean_t, const char *); _LIBZFS_H int zfs_crypto_unload_key(zfs_handle_t *); _LIBZFS_H int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; _LIBZFS_H int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); _LIBZFS_H void zfs_prune_proplist(zfs_handle_t *, uint8_t *); _LIBZFS_H int vdev_expand_proplist(zpool_handle_t *, const char *, zprop_list_t **); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ _LIBZFS_H int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **, zfs_type_t, boolean_t); _LIBZFS_H int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); _LIBZFS_H const char *zpool_prop_default_string(zpool_prop_t); _LIBZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _LIBZFS_H const char *zpool_prop_column_name(zpool_prop_t); _LIBZFS_H boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. */ _LIBZFS_H int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); _LIBZFS_H int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); _LIBZFS_H void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct vdev_cbdata { int cb_name_flags; char **cb_names; unsigned int cb_names_count; } vdev_cbdata_t; typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; zprop_list_t *cb_proplist; zfs_type_t cb_type; vdev_cbdata_t cb_vdevs; } zprop_get_cbdata_t; #define ZFS_SET_NOMOUNT 1 typedef struct zprop_set_cbdata { int cb_flags; nvlist_t *cb_proplist; } zprop_set_cbdata_t; _LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); /* * Iterator functions. */ #define ZFS_ITER_RECURSE (1 << 0) #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) #define ZFS_ITER_DEPTH_LIMIT (1 << 3) #define ZFS_ITER_RECVD_PROPS (1 << 4) #define ZFS_ITER_LITERAL_PROPS (1 << 5) #define ZFS_ITER_SIMPLE (1 << 6) typedef int (*zfs_iter_f)(zfs_handle_t *, void *); _LIBZFS_H int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_children_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_dependents_v2(zfs_handle_t *, int, boolean_t, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_filesystems_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_snapshots_v2(zfs_handle_t *, int, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapshots_sorted_v2(zfs_handle_t *, int, zfs_iter_f, void *, uint64_t, uint64_t); _LIBZFS_H int zfs_iter_snapspec_v2(zfs_handle_t *, int, const char *, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_bookmarks_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; } get_all_cb_t; _LIBZFS_H void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, zfs_iter_f, void *, boolean_t); _LIBZFS_H void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. */ _LIBZFS_H int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); _LIBZFS_H int zfs_create_ancestors(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_destroy(zfs_handle_t *, boolean_t); _LIBZFS_H int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); _LIBZFS_H int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); _LIBZFS_H int zfs_destroy_snaps_nvl_os(libzfs_handle_t *, nvlist_t *); _LIBZFS_H int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); _LIBZFS_H int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); _LIBZFS_H int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); typedef struct renameflags { /* recursive rename */ unsigned int recursive : 1; /* don't unmount file systems */ unsigned int nounmount : 1; /* force unmount file systems */ unsigned int forceunmount : 1; } renameflags_t; _LIBZFS_H int zfs_rename(zfs_handle_t *, const char *, renameflags_t); typedef struct sendflags { /* Amount of extra information to print. */ int verbosity; /* recursive send (ie, -R) */ boolean_t replicate; /* for recursive send, skip sending missing snapshots */ boolean_t skipmissing; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* field no longer used, maintained for backwards compatibility */ boolean_t pad; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. -n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* show progress as process title (ie. -V) */ boolean_t progressastitle; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; /* compressed WRITE records are permitted */ boolean_t compress; /* raw encrypted records are permitted */ boolean_t raw; /* only send received properties (ie. -b) */ boolean_t backup; /* include snapshot holds in send stream */ boolean_t holds; /* stream represents a partially received dataset */ boolean_t saved; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); _LIBZFS_H int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); _LIBZFS_H int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, const char *); _LIBZFS_H int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); _LIBZFS_H int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); _LIBZFS_H int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); _LIBZFS_H nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); _LIBZFS_H int zfs_promote(zfs_handle_t *); _LIBZFS_H int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); _LIBZFS_H int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); _LIBZFS_H int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); _LIBZFS_H int zfs_get_holds(zfs_handle_t *, nvlist_t **); _LIBZFS_H uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); _LIBZFS_H int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); _LIBZFS_H int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); _LIBZFS_H int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* * Mark the file systems as "resumable" and do not destroy them if the * receive is interrupted */ boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; /* Was holds flag set in the compound header? */ boolean_t holds; /* skip receive of snapshot holds */ boolean_t skipholds; /* mount the filesystem unless nomount is specified */ boolean_t domount; /* force unmount while recv snapshot (private) */ boolean_t forceunmount; /* use this recv to check (and heal if needed) an existing snapshot */ boolean_t heal; } recvflags_t; _LIBZFS_H int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 1 << 0, ZFS_DIFF_TIMESTAMP = 1 << 1, ZFS_DIFF_CLASSIFY = 1 << 2, ZFS_DIFF_NO_MANGLE = 1 << 3 } diff_flags_t; _LIBZFS_H int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ _LIBZFS_H const char *zfs_type_to_name(zfs_type_t); _LIBZFS_H void zfs_refresh_properties(zfs_handle_t *); _LIBZFS_H int zfs_name_valid(const char *, zfs_type_t); _LIBZFS_H zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_parent_name(zfs_handle_t *, char *, size_t); _LIBZFS_H boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); _LIBZFS_H int zfs_spa_version(zfs_handle_t *, int *); _LIBZFS_H boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ _LIBZFS_H boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); _LIBZFS_H boolean_t zfs_is_mounted(zfs_handle_t *, char **); _LIBZFS_H int zfs_mount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); _LIBZFS_H int zfs_unmount(zfs_handle_t *, const char *, int); _LIBZFS_H int zfs_unmountall(zfs_handle_t *, int); _LIBZFS_H int zfs_mount_delegation_check(void); #if defined(__linux__) || defined(__APPLE__) _LIBZFS_H int zfs_parse_mount_options(const char *mntopts, unsigned long *mntflags, unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); _LIBZFS_H void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, char *mntopts, char *mtabopt); #endif /* * Share support functions. * * enum sa_protocol * lists are terminated with SA_NO_PROTOCOL, * NULL means "all/any known to this libzfs". */ #define SA_NO_PROTOCOL -1 _LIBZFS_H boolean_t zfs_is_shared(zfs_handle_t *zhp, char **where, const enum sa_protocol *proto); _LIBZFS_H int zfs_share(zfs_handle_t *zhp, const enum sa_protocol *proto); _LIBZFS_H int zfs_unshare(zfs_handle_t *zhp, const char *mountpoint, const enum sa_protocol *proto); _LIBZFS_H int zfs_unshareall(zfs_handle_t *zhp, const enum sa_protocol *proto); _LIBZFS_H void zfs_commit_shares(const enum sa_protocol *proto); _LIBZFS_H void zfs_truncate_shares(const enum sa_protocol *proto); _LIBZFS_H int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Utility functions to run an external process. */ #define STDOUT_VERBOSE 0x01 #define STDERR_VERBOSE 0x02 #define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ _LIBZFS_H int libzfs_run_process(const char *, char **, int); _LIBZFS_H int libzfs_run_process_get_stdout(const char *, char *[], char *[], char **[], int *); _LIBZFS_H int libzfs_run_process_get_stdout_nopath(const char *, char *[], char *[], char **[], int *); _LIBZFS_H void libzfs_free_str_array(char **, int); _LIBZFS_H boolean_t libzfs_envvar_is_set(const char *); /* * Utility functions for zfs version */ _LIBZFS_H const char *zfs_version_userland(void); _LIBZFS_H char *zfs_version_kernel(void); _LIBZFS_H int zfs_version_print(void); /* * Given a device or file, determine if it is part of a pool. */ _LIBZFS_H int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ _LIBZFS_H int zpool_clear_label(int); _LIBZFS_H int zpool_set_bootenv(zpool_handle_t *, const nvlist_t *); _LIBZFS_H int zpool_get_bootenv(zpool_handle_t *, nvlist_t **); /* * Management interfaces for SMB ACL files */ _LIBZFS_H int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); _LIBZFS_H int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); _LIBZFS_H int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ _LIBZFS_H int zpool_enable_datasets(zpool_handle_t *, const char *, int); _LIBZFS_H int zpool_disable_datasets(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_datasets_os(zpool_handle_t *, boolean_t); _LIBZFS_H void zpool_disable_volume_os(const char *); /* * Parse a features file for -o compatibility */ typedef enum { ZPOOL_COMPATIBILITY_OK, ZPOOL_COMPATIBILITY_WARNTOKEN, ZPOOL_COMPATIBILITY_BADTOKEN, ZPOOL_COMPATIBILITY_BADFILE, ZPOOL_COMPATIBILITY_NOFILES } zpool_compat_status_t; _LIBZFS_H zpool_compat_status_t zpool_load_compat(const char *, boolean_t *, char *, size_t); #ifdef __FreeBSD__ /* * Attach/detach the given filesystem to/from the given jail. */ _LIBZFS_H int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); /* * Set loader options for next boot. */ _LIBZFS_H int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); #endif /* __FreeBSD__ */ #ifdef __linux__ /* * Add or delete the given filesystem to/from the given user namespace. */ _LIBZFS_H int zfs_userns(zfs_handle_t *zhp, const char *nspath, int attach); #endif #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ diff --git a/sys/contrib/openzfs/include/libzutil.h b/sys/contrib/openzfs/include/libzutil.h index 237ff976ba62..053b1ed4b52a 100644 --- a/sys/contrib/openzfs/include/libzutil.h +++ b/sys/contrib/openzfs/include/libzutil.h @@ -1,215 +1,215 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. */ #ifndef _LIBZUTIL_H #define _LIBZUTIL_H extern __attribute__((visibility("default"))) #include #include #ifdef __cplusplus extern "C" { #endif /* - * Default wait time for a device name to be created. + * Default wait time in milliseconds for a device name to be created. */ #define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ /* * Pool Config Operations * * These are specific to the library libzfs or libzpool instance. */ typedef nvlist_t *refresh_config_func_t(void *, nvlist_t *); typedef int pool_active_func_t(void *, const char *, uint64_t, boolean_t *); typedef const struct pool_config_ops { refresh_config_func_t *pco_refresh_config; pool_active_func_t *pco_pool_active; } pool_config_ops_t; /* * An instance of pool_config_ops_t is expected in the caller's binary. */ _LIBZUTIL_H pool_config_ops_t libzfs_config_ops; _LIBZUTIL_H pool_config_ops_t libzpool_config_ops; typedef enum lpc_error { LPC_SUCCESS = 0, /* no error -- success */ LPC_BADCACHE = 2000, /* out of memory */ LPC_BADPATH, /* must be an absolute path */ LPC_NOMEM, /* out of memory */ LPC_EACCESS, /* some devices require root privileges */ LPC_UNKNOWN } lpc_error_t; typedef struct importargs { char **path; /* a list of paths to search */ int paths; /* number of paths to search */ const char *poolname; /* name of a pool to find */ uint64_t guid; /* guid of a pool to find */ const char *cachefile; /* cachefile to use for import */ boolean_t can_be_active; /* can the pool be active? */ boolean_t scan; /* prefer scanning to libblkid cache */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ } importargs_t; typedef struct libpc_handle { int lpc_error; boolean_t lpc_printerr; boolean_t lpc_open_access_error; boolean_t lpc_desc_active; char lpc_desc[1024]; pool_config_ops_t *lpc_ops; void *lpc_lib_handle; } libpc_handle_t; _LIBZUTIL_H const char *libpc_error_description(libpc_handle_t *); _LIBZUTIL_H nvlist_t *zpool_search_import(libpc_handle_t *, importargs_t *); _LIBZUTIL_H int zpool_find_config(libpc_handle_t *, const char *, nvlist_t **, importargs_t *); _LIBZUTIL_H const char * const * zpool_default_search_paths(size_t *count); _LIBZUTIL_H int zpool_read_label(int, nvlist_t **, int *); _LIBZUTIL_H int zpool_label_disk_wait(const char *, int); struct udev_device; _LIBZUTIL_H int zfs_device_get_devid(struct udev_device *, char *, size_t); _LIBZUTIL_H int zfs_device_get_physical(struct udev_device *, char *, size_t); _LIBZUTIL_H void update_vdev_config_dev_strs(nvlist_t *); /* * Default device paths */ #define DISK_ROOT "/dev" #define UDISK_ROOT "/dev/disk" #define ZVOL_ROOT "/dev/zvol" _LIBZUTIL_H int zfs_append_partition(char *path, size_t max_len); _LIBZUTIL_H int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); _LIBZUTIL_H char *zfs_strip_partition(const char *); _LIBZUTIL_H const char *zfs_strip_path(const char *); _LIBZUTIL_H int zfs_strcmp_pathname(const char *, const char *, int); _LIBZUTIL_H boolean_t zfs_dev_is_dm(const char *); _LIBZUTIL_H boolean_t zfs_dev_is_whole_disk(const char *); _LIBZUTIL_H int zfs_dev_flush(int); _LIBZUTIL_H char *zfs_get_underlying_path(const char *); _LIBZUTIL_H char *zfs_get_enclosure_sysfs_path(const char *); _LIBZUTIL_H boolean_t is_mpath_whole_disk(const char *); _LIBZUTIL_H boolean_t zfs_isnumber(const char *); /* * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". * * ZFS_NICENUM_1024: Print kilo, mega, tera, peta, exa.. * ZFS_NICENUM_BYTES: Print single bytes ("13B"), kilo, mega, tera... * ZFS_NICENUM_TIME: Print nanosecs, microsecs, millisecs, seconds... * ZFS_NICENUM_RAW: Print the raw number without any formatting * ZFS_NICENUM_RAWTIME: Same as RAW, but print dashes ('-') for zero. */ enum zfs_nicenum_format { ZFS_NICENUM_1024 = 0, ZFS_NICENUM_BYTES = 1, ZFS_NICENUM_TIME = 2, ZFS_NICENUM_RAW = 3, ZFS_NICENUM_RAWTIME = 4 }; /* * Convert a number to a human-readable form. */ _LIBZUTIL_H void zfs_nicebytes(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_nicenum(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_nicenum_format(uint64_t, char *, size_t, enum zfs_nicenum_format); _LIBZUTIL_H void zfs_nicetime(uint64_t, char *, size_t); _LIBZUTIL_H void zfs_niceraw(uint64_t, char *, size_t); #define nicenum(num, buf, size) zfs_nicenum(num, buf, size) _LIBZUTIL_H void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); _LIBZUTIL_H int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); struct zfs_cmd; /* * List of colors to use */ #define ANSI_BLACK "\033[0;30m" #define ANSI_RED "\033[0;31m" #define ANSI_GREEN "\033[0;32m" #define ANSI_YELLOW "\033[0;33m" #define ANSI_BLUE "\033[0;34m" #define ANSI_BOLD_BLUE "\033[1;34m" /* light blue */ #define ANSI_MAGENTA "\033[0;35m" #define ANSI_CYAN "\033[0;36m" #define ANSI_GRAY "\033[0;37m" #define ANSI_RESET "\033[0m" #define ANSI_BOLD "\033[1m" _LIBZUTIL_H int use_color(void); _LIBZUTIL_H void color_start(const char *color); _LIBZUTIL_H void color_end(void); _LIBZUTIL_H int printf_color(const char *color, const char *format, ...); _LIBZUTIL_H const char *zfs_basename(const char *path); _LIBZUTIL_H ssize_t zfs_dirnamelen(const char *path); #ifdef __linux__ extern char **environ; _LIBZUTIL_H void zfs_setproctitle_init(int argc, char *argv[], char *envp[]); _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); #else #define zfs_setproctitle(fmt, ...) setproctitle(fmt, ##__VA_ARGS__) #define zfs_setproctitle_init(x, y, z) ((void)0) #endif /* * These functions are used by the ZFS libraries and cmd/zpool code, but are * not exported in the ABI. */ typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, void *data); int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); #ifdef __cplusplus } #endif #endif /* _LIBZUTIL_H */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h index e757d12c1502..8cfe56c75309 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mutex.h @@ -1,72 +1,73 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_MUTEX_H_ #define _OPENSOLARIS_SYS_MUTEX_H_ typedef struct sx kmutex_t; #include #include #include_next #include_next #include #include typedef enum { MUTEX_DEFAULT = 0 /* kernel default mutex */ } kmutex_type_t; #define MUTEX_HELD(x) (mutex_owned(x)) #define MUTEX_NOT_HELD(x) (!mutex_owned(x) || panicstr) #ifndef OPENSOLARIS_WITNESS #define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS) #else #define MUTEX_FLAGS (SX_DUPOK | SX_NEW) #endif #define mutex_init(lock, desc, type, arg) do { \ const char *_name; \ ASSERT((type) == MUTEX_DEFAULT); \ for (_name = #lock; *_name != '\0'; _name++) { \ if (*_name >= 'a' && *_name <= 'z') \ break; \ } \ if (*_name == '\0') \ _name = #lock; \ sx_init_flags((lock), _name, MUTEX_FLAGS); \ } while (0) #define mutex_destroy(lock) sx_destroy(lock) #define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_interruptible(lock) sx_xlock_sig(lock) #define mutex_enter_nested(lock, type) sx_xlock(lock) #define mutex_tryenter(lock) sx_try_xlock(lock) #define mutex_exit(lock) sx_xunlock(lock) #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h index 30579b391711..b23a939b3aa7 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/taskq.h @@ -1,124 +1,124 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TASKQ_H #define _SYS_TASKQ_H #ifdef _KERNEL #include #include +#include #include #include -#include #ifdef __cplusplus extern "C" { #endif #define TASKQ_NAMELEN 31 typedef struct taskq { struct taskqueue *tq_queue; } taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { - struct task tqent_task; - struct timeout_task tqent_timeout_task; + union { + struct task tqent_task; + struct timeout_task tqent_timeout_task; + }; task_func_t *tqent_func; void *tqent_arg; - taskqid_t tqent_id; - CK_LIST_ENTRY(taskq_ent) tqent_hash; - uint8_t tqent_type; - uint8_t tqent_registered; - uint8_t tqent_cancelled; - volatile uint32_t tqent_rc; + taskqid_t tqent_id; + LIST_ENTRY(taskq_ent) tqent_hash; + uint_t tqent_type; + volatile uint_t tqent_rc; } taskq_ent_t; /* * Public flags for taskq_create(): bit range 0-15 */ #define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ #define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ /* * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as * KM_SLEEP/KM_NOSLEEP. */ #define TQ_SLEEP 0x00 /* Can block for memory */ #define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ #define TQ_FRONT 0x08 /* Put task at the front of the queue */ #define TASKQID_INVALID ((taskqid_t)0) #define taskq_init_ent(x) extern taskq_t *system_taskq; /* Global dynamic task queue for long delay */ extern taskq_t *system_delay_taskq; extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, struct proc *, uint_t); taskq_t *taskq_create_sysdc(const char *, int, int, int, struct proc *, uint_t, uint_t); void nulltask(void *); extern void taskq_destroy(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern void taskq_wait(taskq_t *); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); void taskq_suspend(taskq_t *); int taskq_suspended(taskq_t *); void taskq_resume(taskq_t *); #ifdef __cplusplus } #endif #endif /* _KERNEL */ #ifdef _STANDALONE typedef int taskq_ent_t; #define taskq_init_ent(x) #endif /* _STANDALONE */ #endif /* _SYS_TASKQ_H */ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/vfs_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/vfs_compat.h index e156ed41c28c..aea8bd5ed22c 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/vfs_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/vfs_compat.h @@ -1,475 +1,481 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Copyright (C) 2015 Jörg Thalheim. */ #ifndef _ZFS_VFS_H #define _ZFS_VFS_H #include #include #include #include /* * 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. * 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. * 4.12 - x.y, super_setup_bdi_name() new interface. */ #if defined(HAVE_SUPER_SETUP_BDI_NAME) extern atomic_long_t zfs_bdi_seq; static inline int zpl_bdi_setup(struct super_block *sb, char *name) { return super_setup_bdi_name(sb, "%.28s-%ld", name, atomic_long_inc_return(&zfs_bdi_seq)); } static inline void zpl_bdi_destroy(struct super_block *sb) { } #elif defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup(struct super_block *sb, char *name) { struct backing_dev_info *bdi; int error; bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP); error = bdi_setup_and_register(bdi, name); if (error) { kmem_free(bdi, sizeof (struct backing_dev_info)); return (error); } sb->s_bdi = bdi; return (0); } static inline void zpl_bdi_destroy(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; bdi_destroy(bdi); kmem_free(bdi, sizeof (struct backing_dev_info)); sb->s_bdi = NULL; } #elif defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup(struct super_block *sb, char *name) { struct backing_dev_info *bdi; int error; bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP); error = bdi_setup_and_register(bdi, name, BDI_CAP_MAP_COPY); if (error) { kmem_free(sb->s_bdi, sizeof (struct backing_dev_info)); return (error); } sb->s_bdi = bdi; return (0); } static inline void zpl_bdi_destroy(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; bdi_destroy(bdi); kmem_free(bdi, sizeof (struct backing_dev_info)); sb->s_bdi = NULL; } #else #error "Unsupported kernel" #endif /* * 4.14 adds SB_* flag definitions, define them to MS_* equivalents * if not set. */ #ifndef SB_RDONLY #define SB_RDONLY MS_RDONLY #endif #ifndef SB_SILENT #define SB_SILENT MS_SILENT #endif #ifndef SB_ACTIVE #define SB_ACTIVE MS_ACTIVE #endif #ifndef SB_POSIXACL #define SB_POSIXACL MS_POSIXACL #endif #ifndef SB_MANDLOCK #define SB_MANDLOCK MS_MANDLOCK #endif #ifndef SB_NOATIME #define SB_NOATIME MS_NOATIME #endif /* * 3.5 API change, * The clear_inode() function replaces end_writeback() and introduces an * ordering change regarding when the inode_sync_wait() occurs. See the * configure check in config/kernel-clear-inode.m4 for full details. */ #if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) #define clear_inode(ip) end_writeback(ip) #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ #if defined(SEEK_HOLE) && defined(SEEK_DATA) && !defined(HAVE_LSEEK_EXECUTE) static inline loff_t lseek_execute( struct file *filp, struct inode *inode, loff_t offset, loff_t maxsize) { if (offset < 0 && !(filp->f_mode & FMODE_UNSIGNED_OFFSET)) return (-EINVAL); if (offset > maxsize) return (-EINVAL); if (offset != filp->f_pos) { spin_lock(&filp->f_lock); filp->f_pos = offset; filp->f_version = 0; spin_unlock(&filp->f_lock); } return (offset); } #endif /* SEEK_HOLE && SEEK_DATA && !HAVE_LSEEK_EXECUTE */ #if defined(CONFIG_FS_POSIX_ACL) /* * These functions safely approximates the behavior of posix_acl_release() * which cannot be used because it calls the GPL-only symbol kfree_rcu(). * The in-kernel version, which can access the RCU, frees the ACLs after * the grace period expires. Because we're unsure how long that grace * period may be this implementation conservatively delays for 60 seconds. * This is several orders of magnitude larger than expected grace period. * At 60 seconds the kernel will also begin issuing RCU stall warnings. */ #include #if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) #define zpl_posix_acl_release(arg) posix_acl_release(arg) #else void zpl_posix_acl_release_impl(struct posix_acl *); static inline void zpl_posix_acl_release(struct posix_acl *acl) { if ((acl == NULL) || (acl == ACL_NOT_CACHED)) return; #ifdef HAVE_ACL_REFCOUNT if (refcount_dec_and_test(&acl->a_refcount)) zpl_posix_acl_release_impl(acl); #else if (atomic_dec_and_test(&acl->a_refcount)) zpl_posix_acl_release_impl(acl); #endif } #endif /* HAVE_POSIX_ACL_RELEASE */ #ifdef HAVE_SET_CACHED_ACL_USABLE #define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) #define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) #else static inline void zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { struct posix_acl *older = NULL; spin_lock(&ip->i_lock); if ((newer != ACL_NOT_CACHED) && (newer != NULL)) posix_acl_dup(newer); switch (type) { case ACL_TYPE_ACCESS: older = ip->i_acl; rcu_assign_pointer(ip->i_acl, newer); break; case ACL_TYPE_DEFAULT: older = ip->i_default_acl; rcu_assign_pointer(ip->i_default_acl, newer); break; } spin_unlock(&ip->i_lock); zpl_posix_acl_release(older); } static inline void zpl_forget_cached_acl(struct inode *ip, int type) { zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); } #endif /* HAVE_SET_CACHED_ACL_USABLE */ /* * 3.1 API change, * posix_acl_chmod() was added as the preferred interface. * * 3.14 API change, * posix_acl_chmod() was changed to __posix_acl_chmod() */ #ifndef HAVE___POSIX_ACL_CHMOD #ifdef HAVE_POSIX_ACL_CHMOD #define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) #define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) #else #error "Unsupported kernel" #endif /* HAVE_POSIX_ACL_CHMOD */ #endif /* HAVE___POSIX_ACL_CHMOD */ /* * 4.8 API change, * posix_acl_valid() now must be passed a namespace, the namespace from * from super block associated with the given inode is used for this purpose. */ #ifdef HAVE_POSIX_ACL_VALID_WITH_NS #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(ip->i_sb->s_user_ns, acl) #else #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(acl) #endif #endif /* CONFIG_FS_POSIX_ACL */ /* * 3.19 API change * struct access f->f_dentry->d_inode was replaced by accessor function * file_inode(f) */ #ifndef HAVE_FILE_INODE static inline struct inode *file_inode(const struct file *f) { return (f->f_dentry->d_inode); } #endif /* HAVE_FILE_INODE */ /* * 4.1 API change * struct access file->f_path.dentry was replaced by accessor function * file_dentry(f) */ #ifndef HAVE_FILE_DENTRY static inline struct dentry *file_dentry(const struct file *f) { return (f->f_path.dentry); } #endif /* HAVE_FILE_DENTRY */ static inline uid_t zfs_uid_read_impl(struct inode *ip) { return (from_kuid(kcred->user_ns, ip->i_uid)); } static inline uid_t zfs_uid_read(struct inode *ip) { return (zfs_uid_read_impl(ip)); } static inline gid_t zfs_gid_read_impl(struct inode *ip) { return (from_kgid(kcred->user_ns, ip->i_gid)); } static inline gid_t zfs_gid_read(struct inode *ip) { return (zfs_gid_read_impl(ip)); } static inline void zfs_uid_write(struct inode *ip, uid_t uid) { ip->i_uid = make_kuid(kcred->user_ns, uid); } static inline void zfs_gid_write(struct inode *ip, gid_t gid) { ip->i_gid = make_kgid(kcred->user_ns, gid); } /* * 3.15 API change */ #ifndef RENAME_NOREPLACE #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #endif #ifndef RENAME_EXCHANGE #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #endif #ifndef RENAME_WHITEOUT #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ #endif /* * 4.9 API change */ #if !(defined(HAVE_SETATTR_PREPARE_NO_USERNS) || \ defined(HAVE_SETATTR_PREPARE_USERNS) || \ defined(HAVE_SETATTR_PREPARE_IDMAP)) static inline int setattr_prepare(struct dentry *dentry, struct iattr *ia) { return (inode_change_ok(dentry->d_inode, ia)); } #endif /* * 4.11 API change * These macros are defined by kernel 4.11. We define them so that the same * code builds under kernels < 4.11 and >= 4.11. The macros are set to 0 so * that it will create obvious failures if they are accidentally used when built * against a kernel >= 4.11. */ #ifndef STATX_BASIC_STATS #define STATX_BASIC_STATS 0 #endif #ifndef AT_STATX_SYNC_AS_STAT #define AT_STATX_SYNC_AS_STAT 0 #endif /* * 4.11 API change * 4.11 takes struct path *, < 4.11 takes vfsmount * */ #ifdef HAVE_VFSMOUNT_IOPS_GETATTR #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) \ { \ struct path path = { .mnt = mnt, .dentry = dentry }; \ return func##_impl(&path, stat, STATX_BASIC_STATS, \ AT_STATX_SYNC_AS_STAT); \ } #elif defined(HAVE_PATH_IOPS_GETATTR) #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(const struct path *path, struct kstat *stat, u32 request_mask, \ unsigned int query_flags) \ { \ return (func##_impl(path, stat, request_mask, query_flags)); \ } #elif defined(HAVE_USERNS_IOPS_GETATTR) #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(struct user_namespace *user_ns, const struct path *path, \ struct kstat *stat, u32 request_mask, unsigned int query_flags) \ { \ return (func##_impl(user_ns, path, stat, request_mask, \ query_flags)); \ } #elif defined(HAVE_IDMAP_IOPS_GETATTR) #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(struct mnt_idmap *user_ns, const struct path *path, \ struct kstat *stat, u32 request_mask, unsigned int query_flags) \ { \ return (func##_impl(user_ns, path, stat, request_mask, \ query_flags)); \ } #else #error #endif /* * 4.9 API change * Preferred interface to get the current FS time. */ #if !defined(HAVE_CURRENT_TIME) static inline struct timespec current_time(struct inode *ip) { return (timespec_trunc(current_kernel_time(), ip->i_sb->s_time_gran)); } #endif /* * 4.16 API change * Added iversion interface for managing inode version field. */ #ifdef HAVE_INODE_SET_IVERSION #include #else static inline void inode_set_iversion(struct inode *ip, u64 val) { ip->i_version = val; } #endif /* * Returns true when called in the context of a 32-bit system call. */ static inline int zpl_is_32bit_api(void) { #ifdef CONFIG_COMPAT #ifdef HAVE_IN_COMPAT_SYSCALL return (in_compat_syscall()); #else return (is_compat_task()); #endif #else return (BITS_PER_LONG == 32); #endif } /* * 5.12 API change * To support id-mapped mounts, generic_fillattr() was modified to * accept a new struct user_namespace* as its first arg. * * 6.3 API change * generic_fillattr() first arg is changed to struct mnt_idmap * * + * 6.6 API change + * generic_fillattr() gets new second arg request_mask, a u32 type + * */ #ifdef HAVE_GENERIC_FILLATTR_IDMAP #define zpl_generic_fillattr(idmap, ip, sp) \ generic_fillattr(idmap, ip, sp) +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) +#define zpl_generic_fillattr(idmap, rqm, ip, sp) \ + generic_fillattr(idmap, rqm, ip, sp) #elif defined(HAVE_GENERIC_FILLATTR_USERNS) #define zpl_generic_fillattr(user_ns, ip, sp) \ generic_fillattr(user_ns, ip, sp) #else #define zpl_generic_fillattr(user_ns, ip, sp) generic_fillattr(ip, sp) #endif #endif /* _ZFS_VFS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h index 20eeadc46e10..82d50b6034c4 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem_cache.h @@ -1,220 +1,220 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_KMEM_CACHE_H #define _SPL_KMEM_CACHE_H #include /* * Slab allocation interfaces. The SPL slab differs from the standard * Linux SLAB or SLUB primarily in that each cache may be backed by slabs * allocated from the physical or virtual memory address space. The virtual * slabs allow for good behavior when allocation large objects of identical * size. This slab implementation also supports both constructors and * destructors which the Linux slab does not. */ typedef enum kmc_bit { KMC_BIT_NODEBUG = 1, /* Default behavior */ KMC_BIT_KVMEM = 7, /* Use kvmalloc linux allocator */ KMC_BIT_SLAB = 8, /* Use Linux slab cache */ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ KMC_BIT_GROWING = 15, /* Growing in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */ KMC_BIT_DESTROY = 17, /* Destroy in progress */ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ KMC_BIT_MAX = 20, /* Proc handler helper bit */ } kmc_bit_t; /* kmem move callback return values */ typedef enum kmem_cbrc { KMEM_CBRC_YES = 0, /* Object moved */ KMEM_CBRC_NO = 1, /* Object not moved */ KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ } kmem_cbrc_t; #define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) #define KMC_KVMEM (1 << KMC_BIT_KVMEM) #define KMC_SLAB (1 << KMC_BIT_SLAB) #define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) #define KMC_GROWING (1 << KMC_BIT_GROWING) #define KMC_REAPING (1 << KMC_BIT_REAPING) #define KMC_DESTROY (1 << KMC_BIT_DESTROY) #define KMC_TOTAL (1 << KMC_BIT_TOTAL) #define KMC_ALLOC (1 << KMC_BIT_ALLOC) #define KMC_MAX (1 << KMC_BIT_MAX) #define KMC_REAP_CHUNK INT_MAX #define KMC_DEFAULT_SEEKS 1 #define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; #define SKM_MAGIC 0x2e2e2e2e #define SKO_MAGIC 0x20202020 #define SKS_MAGIC 0x22222222 #define SKC_MAGIC 0x2c2c2c2c #define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ #ifdef _LP64 #define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ #else #define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */ #endif #define SPL_MAX_ORDER (MAX_ORDER - 3) #define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1)) #ifdef CONFIG_SLUB #define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER #define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1)) #else #define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT) #endif typedef int (*spl_kmem_ctor_t)(void *, void *, int); typedef void (*spl_kmem_dtor_t)(void *, void *); typedef struct spl_kmem_magazine { uint32_t skm_magic; /* Sanity magic */ uint32_t skm_avail; /* Available objects */ uint32_t skm_size; /* Magazine size */ uint32_t skm_refill; /* Batch refill size */ struct spl_kmem_cache *skm_cache; /* Owned by cache */ unsigned int skm_cpu; /* Owned by cpu */ - void *skm_objs[0]; /* Object pointers */ + void *skm_objs[]; /* Object pointers */ } spl_kmem_magazine_t; typedef struct spl_kmem_obj { uint32_t sko_magic; /* Sanity magic */ void *sko_addr; /* Buffer address */ struct spl_kmem_slab *sko_slab; /* Owned by slab */ struct list_head sko_list; /* Free object list linkage */ } spl_kmem_obj_t; typedef struct spl_kmem_slab { uint32_t sks_magic; /* Sanity magic */ uint32_t sks_objs; /* Objects per slab */ struct spl_kmem_cache *sks_cache; /* Owned by cache */ struct list_head sks_list; /* Slab list linkage */ struct list_head sks_free_list; /* Free object list */ unsigned long sks_age; /* Last modify jiffie */ uint32_t sks_ref; /* Ref count used objects */ } spl_kmem_slab_t; typedef struct spl_kmem_alloc { struct spl_kmem_cache *ska_cache; /* Owned by cache */ int ska_flags; /* Allocation flags */ taskq_ent_t ska_tqe; /* Task queue entry */ } spl_kmem_alloc_t; typedef struct spl_kmem_emergency { struct rb_node ske_node; /* Emergency tree linkage */ unsigned long ske_obj; /* Buffer address */ } spl_kmem_emergency_t; typedef struct spl_kmem_cache { uint32_t skc_magic; /* Sanity magic */ uint32_t skc_name_size; /* Name length */ char *skc_name; /* Name string */ spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */ uint32_t skc_mag_size; /* Magazine size */ uint32_t skc_mag_refill; /* Magazine refill count */ spl_kmem_ctor_t skc_ctor; /* Constructor */ spl_kmem_dtor_t skc_dtor; /* Destructor */ void *skc_private; /* Private data */ void *skc_vmp; /* Unused */ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ unsigned long skc_flags; /* Flags */ uint32_t skc_obj_size; /* Object size */ uint32_t skc_obj_align; /* Object alignment */ uint32_t skc_slab_objs; /* Objects per slab */ uint32_t skc_slab_size; /* Slab size */ atomic_t skc_ref; /* Ref count callers */ taskqid_t skc_taskqid; /* Slab reclaim task */ struct list_head skc_list; /* List of caches linkage */ struct list_head skc_complete_list; /* Completely alloc'ed */ struct list_head skc_partial_list; /* Partially alloc'ed */ struct rb_root skc_emergency_tree; /* Min sized objects */ spinlock_t skc_lock; /* Cache lock */ spl_wait_queue_head_t skc_waitq; /* Allocation waiters */ uint64_t skc_slab_fail; /* Slab alloc failures */ uint64_t skc_slab_create; /* Slab creates */ uint64_t skc_slab_destroy; /* Slab destroys */ uint64_t skc_slab_total; /* Slab total current */ uint64_t skc_slab_alloc; /* Slab alloc current */ uint64_t skc_slab_max; /* Slab max historic */ uint64_t skc_obj_total; /* Obj total current */ uint64_t skc_obj_alloc; /* Obj alloc current */ struct percpu_counter skc_linux_alloc; /* Linux-backed Obj alloc */ uint64_t skc_obj_max; /* Obj max historic */ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ uint64_t skc_obj_emergency; /* Obj emergency current */ uint64_t skc_obj_emergency_max; /* Obj emergency max */ } spl_kmem_cache_t; #define kmem_cache_t spl_kmem_cache_t extern spl_kmem_cache_t *spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags); extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); extern void spl_kmem_reap(void); extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); #define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) /* * This is necessary to be compatible with other kernel modules * or in-tree filesystem that may define kmem_cache_alloc, * like bcachefs does it now. */ #ifdef kmem_cache_alloc #undef kmem_cache_alloc #endif #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) #define kmem_reap() spl_kmem_reap() /* * The following functions are only available for internal use. */ extern int spl_kmem_cache_init(void); extern void spl_kmem_cache_fini(void); #endif /* _SPL_KMEM_CACHE_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h index 6b61c59c48e2..b4eaa0266d20 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h @@ -1,184 +1,189 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_MUTEX_H #define _SPL_MUTEX_H #include #include #include #include #include typedef enum { MUTEX_DEFAULT = 0, MUTEX_SPIN = 1, MUTEX_ADAPTIVE = 2, MUTEX_NOLOCKDEP = 3 } kmutex_type_t; typedef struct { struct mutex m_mutex; spinlock_t m_lock; /* used for serializing mutex_exit */ kthread_t *m_owner; #ifdef CONFIG_LOCKDEP kmutex_type_t m_type; #endif /* CONFIG_LOCKDEP */ } kmutex_t; #define MUTEX(mp) (&((mp)->m_mutex)) static inline void spl_mutex_set_owner(kmutex_t *mp) { mp->m_owner = current; } static inline void spl_mutex_clear_owner(kmutex_t *mp) { mp->m_owner = NULL; } #define mutex_owner(mp) (READ_ONCE((mp)->m_owner)) #define mutex_owned(mp) (mutex_owner(mp) == current) #define MUTEX_HELD(mp) mutex_owned(mp) #define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) #ifdef CONFIG_LOCKDEP static inline void spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type) { mp->m_type = type; } static inline void spl_mutex_lockdep_off_maybe(kmutex_t *mp) \ { \ if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ lockdep_off(); \ } static inline void spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ { \ if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ lockdep_on(); \ } #else /* CONFIG_LOCKDEP */ #define spl_mutex_set_type(mp, type) #define spl_mutex_lockdep_off_maybe(mp) #define spl_mutex_lockdep_on_maybe(mp) #endif /* CONFIG_LOCKDEP */ /* * The following functions must be a #define and not static inline. * This ensures that the native linux mutex functions (lock/unlock) * will be correctly located in the users code which is important * for the built in kernel lock analysis tools */ #undef mutex_init #define mutex_init(mp, name, type, ibc) \ { \ static struct lock_class_key __key; \ ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \ \ __mutex_init(MUTEX(mp), (name) ? (#name) : (#mp), &__key); \ spin_lock_init(&(mp)->m_lock); \ spl_mutex_clear_owner(mp); \ spl_mutex_set_type(mp, type); \ } #undef mutex_destroy #define mutex_destroy(mp) \ { \ VERIFY3P(mutex_owner(mp), ==, NULL); \ } #define mutex_tryenter(mp) \ /* CSTYLED */ \ ({ \ int _rc_; \ \ spl_mutex_lockdep_off_maybe(mp); \ if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \ spl_mutex_set_owner(mp); \ spl_mutex_lockdep_on_maybe(mp); \ \ _rc_; \ }) #define NESTED_SINGLE 1 -#ifdef CONFIG_DEBUG_LOCK_ALLOC #define mutex_enter_nested(mp, subclass) \ { \ ASSERT3P(mutex_owner(mp), !=, current); \ spl_mutex_lockdep_off_maybe(mp); \ mutex_lock_nested(MUTEX(mp), (subclass)); \ spl_mutex_lockdep_on_maybe(mp); \ spl_mutex_set_owner(mp); \ } -#else /* CONFIG_DEBUG_LOCK_ALLOC */ -#define mutex_enter_nested(mp, subclass) \ -{ \ + +#define mutex_enter_interruptible(mp) \ +/* CSTYLED */ \ +({ \ + int _rc_; \ + \ ASSERT3P(mutex_owner(mp), !=, current); \ spl_mutex_lockdep_off_maybe(mp); \ - mutex_lock(MUTEX(mp)); \ + _rc_ = mutex_lock_interruptible(MUTEX(mp)); \ spl_mutex_lockdep_on_maybe(mp); \ - spl_mutex_set_owner(mp); \ -} -#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + if (!_rc_) { \ + spl_mutex_set_owner(mp); \ + } \ + \ + _rc_; \ +}) #define mutex_enter(mp) mutex_enter_nested((mp), 0) /* * The reason for the spinlock: * * The Linux mutex is designed with a fast-path/slow-path design such that it * does not guarantee serialization upon itself, allowing a race where latter * acquirers finish mutex_unlock before former ones. * * The race renders it unsafe to be used for serializing the freeing of an * object in which the mutex is embedded, where the latter acquirer could go * on to free the object while the former one is still doing mutex_unlock and * causing memory corruption. * * However, there are many places in ZFS where the mutex is used for * serializing object freeing, and the code is shared among other OSes without * this issue. Thus, we need the spinlock to force the serialization on * mutex_exit(). * * See http://lwn.net/Articles/575477/ for the information about the race. */ #define mutex_exit(mp) \ { \ ASSERT3P(mutex_owner(mp), ==, current); \ spl_mutex_clear_owner(mp); \ spin_lock(&(mp)->m_lock); \ spl_mutex_lockdep_off_maybe(mp); \ mutex_unlock(MUTEX(mp)); \ spl_mutex_lockdep_on_maybe(mp); \ spin_unlock(&(mp)->m_lock); \ /* NOTE: do not dereference mp after this point */ \ } #endif /* _SPL_MUTEX_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index cce097e16fbc..a4b600004c9f 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -1,188 +1,180 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_UIO_H #define _SPL_UIO_H #include #include #include #include #include #include #include #include #if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) #endif typedef struct iovec iovec_t; typedef enum zfs_uio_rw { UIO_READ = 0, UIO_WRITE = 1, } zfs_uio_rw_t; typedef enum zfs_uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, UIO_BVEC = 2, #if defined(HAVE_VFS_IOV_ITER) UIO_ITER = 3, #endif } zfs_uio_seg_t; typedef struct zfs_uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; #if defined(HAVE_VFS_IOV_ITER) struct iov_iter *uio_iter; #endif }; int uio_iovcnt; offset_t uio_loffset; zfs_uio_seg_t uio_segflg; boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; ssize_t uio_resid; size_t uio_skip; struct request *rq; - - /* - * Used for saving rq_for_each_segment() state between calls - * to zfs_uiomove_bvec_rq(). - */ - struct req_iterator iter; - struct bio_vec bv; } zfs_uio_t; #define zfs_uio_segflg(u) (u)->uio_segflg #define zfs_uio_offset(u) (u)->uio_loffset #define zfs_uio_resid(u) (u)->uio_resid #define zfs_uio_iovcnt(u) (u)->uio_iovcnt #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) extern int zfs_uio_prefaultpages(ssize_t, zfs_uio_t *); static inline void zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) { uio->uio_loffset = off; } static inline void zfs_uio_advance(zfs_uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } static inline void zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, size_t skip) { ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); uio->uio_iov = iov; uio->uio_iovcnt = nr_segs; uio->uio_loffset = offset; uio->uio_segflg = seg; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } static inline void zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) { /* Either bio or rq will be set, but not both */ ASSERT3P(uio, !=, bio); if (bio) { uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; } else { uio->uio_bvec = NULL; uio->uio_iovcnt = 0; - memset(&uio->iter, 0, sizeof (uio->iter)); } uio->uio_loffset = io_offset(bio, rq); uio->uio_segflg = UIO_BVEC; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = io_size(bio, rq); if (bio) { uio->uio_skip = BIO_BI_SKIP(bio); } else { uio->uio_skip = 0; } uio->rq = rq; } #if defined(HAVE_VFS_IOV_ITER) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, ssize_t resid, size_t skip) { uio->uio_iter = iter; uio->uio_iovcnt = iter->nr_segs; uio->uio_loffset = offset; uio->uio_segflg = UIO_ITER; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } #endif #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) #else #define zfs_uio_iter_iov(iter) (iter)->iov #endif #if defined(HAVE_IOV_ITER_TYPE) #define zfs_uio_iov_iter_type(iter) iov_iter_type((iter)) #else #define zfs_uio_iov_iter_type(iter) (iter)->type #endif #endif /* SPL_UIO_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h index b4d5db21f5e5..220466550258 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -1,255 +1,257 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; +extern int zfs_bclone_enabled; + /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural * differences between the platforms. */ typedef struct vfs { struct zfsvfs *vfs_data; char *vfs_mntpoint; /* Primary mount point */ uint64_t vfs_xattr; boolean_t vfs_readonly; boolean_t vfs_do_readonly; boolean_t vfs_setuid; boolean_t vfs_do_setuid; boolean_t vfs_exec; boolean_t vfs_do_exec; boolean_t vfs_devices; boolean_t vfs_do_devices; boolean_t vfs_do_xattr; boolean_t vfs_atime; boolean_t vfs_do_atime; boolean_t vfs_relatime; boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; } vfs_t; typedef struct zfs_mnt { const char *mnt_osname; /* Objset name */ char *mnt_data; /* Raw mount options */ } zfs_mnt_t; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ struct super_block *z_sb; /* generic super_block */ struct zfsvfs *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ uint_t z_acl_type; /* type of ACL usable on this FS */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_relatime; /* enable relatime mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ struct inode *z_ctldir; /* .zfs directory inode */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_draining; /* is true when drain is active */ boolean_t z_drain_cancel; /* signal the unlinked drain to stop */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ uint64_t z_hold_size; /* znode hold array size */ avl_tree_t *z_hold_trees; /* znode hold trees */ kmutex_t *z_hold_locks; /* znode hold locks */ taskqid_t z_drain_task; /* task id for the unlink drain task */ }; #define ZFS_TEARDOWN_INIT(zfsvfs) \ rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Allow a maximum number of links. While ZFS does not internally limit * this the inode->i_nlink member is defined as an unsigned int. To be * safe we use 2^31-1 as the limit. */ #define ZFS_LINK_MAX ((1U << 31) - 1U) /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern void zfs_exit_fs(zfsvfs_t *zfsvfs); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); extern void zfs_preumount(struct super_block *sb); extern int zfs_umount(struct super_block *sb); extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp); extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects); extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops_os.h index 7a1db7deeec8..830c76e5743a 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -1,86 +1,91 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_VNOPS_OS_H #define _SYS_FS_ZFS_VNOPS_OS_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); extern int zfs_close(struct inode *ip, int flag, cred_t *cr); extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *resid); extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns); extern int zfs_tmpfile(struct inode *dip, vattr_t *vapzfs, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns); extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns); extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +extern int zfs_getattr_fast(zidmap_t *, u32 request_mask, struct inode *ip, + struct kstat *sp); +#else extern int zfs_getattr_fast(zidmap_t *, struct inode *ip, struct kstat *sp); +#endif extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, zidmap_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns); extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns); extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr); extern int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags); extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); extern int zfs_getpage(struct inode *ip, struct page *pp); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync); extern int zfs_dirty_inode(struct inode *ip, int flags); extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags); extern void zfs_zrele_async(znode_t *zp); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h index 0bd20f64897d..9b729be6d74d 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h @@ -1,266 +1,277 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. */ #ifndef _SYS_ZPL_H #define _SYS_ZPL_H #include #include #include #include #include #include #include #include #include #include #include /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, zidmap_t *mnt_ns); extern const struct inode_operations zpl_inode_operations; #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER extern const struct inode_operations_wrapper zpl_dir_inode_operations; #else extern const struct inode_operations zpl_dir_inode_operations; #endif extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; /* zpl_file.c */ extern const struct address_space_operations zpl_address_space_operations; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND extern const struct file_operations_extend zpl_file_operations; #else extern const struct file_operations zpl_file_operations; #endif extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ -extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); +extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; /* zpl_xattr.c */ extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) #if defined(HAVE_SET_ACL_IDMAP_DENTRY) extern int zpl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); #elif defined(HAVE_SET_ACL_USERNS) extern int zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type); #elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) extern int zpl_set_acl(struct user_namespace *userns, struct dentry *dentry, struct posix_acl *acl, int type); #else extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); #endif /* HAVE_SET_ACL_USERNS */ #endif /* HAVE_SET_ACL */ #if defined(HAVE_GET_ACL_RCU) || defined(HAVE_GET_INODE_ACL) extern struct posix_acl *zpl_get_acl(struct inode *ip, int type, bool rcu); #elif defined(HAVE_GET_ACL) extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); #endif extern int zpl_init_acl(struct inode *ip, struct inode *dir); extern int zpl_chmod_acl(struct inode *ip); #else static inline int zpl_init_acl(struct inode *ip, struct inode *dir) { return (0); } static inline int zpl_chmod_acl(struct inode *ip) { return (0); } #endif /* CONFIG_FS_POSIX_ACL */ extern xattr_handler_t *zpl_xattr_handlers[]; /* zpl_ctldir.c */ extern const struct file_operations zpl_fops_root; extern const struct inode_operations zpl_ops_root; extern const struct file_operations zpl_fops_snapdir; extern const struct inode_operations zpl_ops_snapdir; extern const struct file_operations zpl_fops_shares; extern const struct inode_operations zpl_ops_shares; #if defined(HAVE_VFS_ITERATE) || defined(HAVE_VFS_ITERATE_SHARED) #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .actor = _actor, \ .pos = _pos, \ } typedef struct dir_context zpl_dir_context_t; #define zpl_dir_emit dir_emit #define zpl_dir_emit_dot dir_emit_dot #define zpl_dir_emit_dotdot dir_emit_dotdot #define zpl_dir_emit_dots dir_emit_dots #else typedef struct zpl_dir_context { void *dirent; const filldir_t actor; loff_t pos; } zpl_dir_context_t; #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .dirent = _dirent, \ .actor = _actor, \ .pos = _pos, \ } static inline bool zpl_dir_emit(zpl_dir_context_t *ctx, const char *name, int namelen, uint64_t ino, unsigned type) { return (!ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)); } static inline bool zpl_dir_emit_dot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, ".", 1, ctx->pos, file_inode(file)->i_ino, DT_DIR) == 0); } static inline bool zpl_dir_emit_dotdot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, "..", 2, ctx->pos, parent_ino(file_dentry(file)), DT_DIR) == 0); } static inline bool zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) { if (ctx->pos == 0) { if (!zpl_dir_emit_dot(file, ctx)) return (false); ctx->pos = 1; } if (ctx->pos == 1) { if (!zpl_dir_emit_dotdot(file, ctx)) return (false); ctx->pos = 2; } return (true); } #endif /* HAVE_VFS_ITERATE */ /* zpl_file_range.c */ /* handlers for file_operations of the same name */ extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags); extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags); extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); /* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */ typedef struct { int64_t fcr_src_fd; uint64_t fcr_src_offset; uint64_t fcr_src_length; uint64_t fcr_dest_offset; } zfs_ioc_compat_file_clone_range_t; typedef struct { int64_t fdri_dest_fd; uint64_t fdri_dest_offset; uint64_t fdri_bytes_deduped; int32_t fdri_status; uint32_t fdri_reserved; } zfs_ioc_compat_dedupe_range_info_t; typedef struct { uint64_t fdr_src_offset; uint64_t fdr_src_length; uint16_t fdr_dest_count; uint16_t fdr_reserved1; uint32_t fdr_reserved2; zfs_ioc_compat_dedupe_range_info_t fdr_info[]; } zfs_ioc_compat_dedupe_range_t; #define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int) #define ZFS_IOC_COMPAT_FICLONERANGE \ _IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t) #define ZFS_IOC_COMPAT_FIDEDUPERANGE \ _IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t) extern long zpl_ioctl_ficlone(struct file *filp, void *arg); extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg); extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) #elif defined(HAVE_INODE_TIMESPEC64_TIMES) #define zpl_inode_timestamp_truncate(ts, ip) \ timespec64_trunc(ts, (ip)->i_sb->s_time_gran) #else #define zpl_inode_timestamp_truncate(ts, ip) \ timespec_trunc(ts, (ip)->i_sb->s_time_gran) #endif #if defined(HAVE_INODE_OWNER_OR_CAPABLE) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ip) #elif defined(HAVE_INODE_OWNER_OR_CAPABLE_USERNS) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ns, ip) #elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAP) #define zpl_inode_owner_or_capable(idmap, ip) inode_owner_or_capable(idmap, ip) #else #error "Unsupported kernel" #endif #if defined(HAVE_SETATTR_PREPARE_USERNS) || defined(HAVE_SETATTR_PREPARE_IDMAP) #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(ns, dentry, ia) #else /* * Use kernel-provided version, or our own from * linux/vfs_compat.h */ #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) #endif +#ifdef HAVE_INODE_GET_CTIME +#define zpl_inode_get_ctime(ip) inode_get_ctime(ip) +#else +#define zpl_inode_get_ctime(ip) (ip->i_ctime) +#endif +#ifdef HAVE_INODE_SET_CTIME_TO_TS +#define zpl_inode_set_ctime_to_ts(ip, ts) inode_set_ctime_to_ts(ip, ts) +#else +#define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) +#endif + #endif /* _SYS_ZPL_H */ diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index 9d67dab06ca3..05307aab99e3 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -1,349 +1,349 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_ARC_H #define _SYS_ARC_H #include #ifdef __cplusplus extern "C" { #endif #include #include #include #include /* * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. */ #define ARC_EVICT_ALL UINT64_MAX /* * ZFS gets very unhappy when the maximum ARC size is smaller than the maximum * block size and a larger block is written. To leave some safety margin, we * limit the minimum for zfs_arc_max to the maximium transaction size. */ #define MIN_ARC_MAX DMU_MAX_ACCESS #define HDR_SET_LSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) #define HDR_SET_PSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) #define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) #define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; /* * Because the ARC can store encrypted data, errors (not due to bugs) may arise * while transforming data into its desired format - specifically, when * decrypting, the key may not be present, or the HMAC may not be correct * which signifies deliberate tampering with the on-disk state * (assuming that the checksum was correct). If any error occurs, the "buf" * parameter will be NULL. */ typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); +typedef void arc_prune_func_t(uint64_t bytes, void *priv); /* Shared module parameters */ extern uint_t zfs_arc_average_blocksize; extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; arc_read_done_func_t arc_getbuf_func; /* generic arc_prune_func_t wrapper for callbacks */ struct arc_prune { arc_prune_func_t *p_pfunc; void *p_private; uint64_t p_adjust; list_node_t p_node; zfs_refcount_t p_refcnt; }; typedef enum arc_strategy { ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ } arc_strategy_t; typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ /* * Encrypted or authenticated on disk (may be plaintext in memory). * This header has b_crypt_hdr allocated. Does not include indirect * blocks with checksums of MACs which will also have their X * (encrypted) bit set in the bp. */ ARC_FLAG_PROTECTED = 1 << 15, /* data has not been authenticated yet */ ARC_FLAG_NOAUTH = 1 << 16, /* indicates that the buffer contains metadata (otherwise, data) */ ARC_FLAG_BUFC_METADATA = 1 << 17, /* Flags specifying whether optional hdr struct fields are defined */ ARC_FLAG_HAS_L1HDR = 1 << 18, ARC_FLAG_HAS_L2HDR = 1 << 19, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. */ ARC_FLAG_COMPRESSED_ARC = 1 << 20, ARC_FLAG_SHARED_DATA = 1 << 21, /* * Fail this arc_read() (with ENOENT) if the data is not already present * in cache. */ ARC_FLAG_CACHED_ONLY = 1 << 22, /* * Don't instantiate an arc_buf_t for arc_read_done. */ ARC_FLAG_NO_BUF = 1 << 23, /* * The arc buffer's compression mode is stored in the top 7 bits of the * flags field, so these dummy flags are included so that MDB can * interpret the enum properly. */ ARC_FLAG_COMPRESS_0 = 1 << 24, ARC_FLAG_COMPRESS_1 = 1 << 25, ARC_FLAG_COMPRESS_2 = 1 << 26, ARC_FLAG_COMPRESS_3 = 1 << 27, ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 } arc_flags_t; typedef enum arc_buf_flags { ARC_BUF_FLAG_SHARED = 1 << 0, ARC_BUF_FLAG_COMPRESSED = 1 << 1, /* * indicates whether this arc_buf_t is encrypted, regardless of * state on-disk */ ARC_BUF_FLAG_ENCRYPTED = 1 << 2 } arc_buf_flags_t; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; void *b_data; arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * The following breakdowns of arc_size exist for kstat only. */ typedef enum arc_space_type { ARC_SPACE_DATA, ARC_SPACE_META, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_DBUF, ARC_SPACE_DNODE, ARC_SPACE_BONUS, ARC_SPACE_ABD_CHUNK_WASTE, ARC_SPACE_NUMTYPES } arc_space_type_t; typedef enum arc_state_type { ARC_STATE_ANON, ARC_STATE_MRU, ARC_STATE_MRU_GHOST, ARC_STATE_MFU, ARC_STATE_MFU_GHOST, ARC_STATE_L2C_ONLY, ARC_STATE_UNCACHED, ARC_STATE_NUMTYPES } arc_state_type_t; typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; uint32_t abi_flags; uint32_t abi_bufcnt; uint64_t abi_size; uint64_t abi_spa; uint64_t abi_access; uint32_t abi_mru_hits; uint32_t abi_mru_ghost_hits; uint32_t abi_mfu_hits; uint32_t abi_mfu_ghost_hits; uint32_t abi_l2arc_hits; uint32_t abi_holds; uint64_t abi_l2arc_dattr; uint64_t abi_l2arc_asize; enum zio_compress abi_l2arc_compress; } arc_buf_info_t; void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); boolean_t arc_is_encrypted(arc_buf_t *buf); boolean_t arc_is_unauthenticated(arc_buf_t *buf); enum zio_compress arc_get_compression(arc_buf_t *buf); void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac); int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place); void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac); arc_buf_t *arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size); arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); uint8_t arc_get_complevel(arc_buf_t *buf); arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); void arc_return_buf(arc_buf_t *buf, const void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag); void arc_buf_destroy(arc_buf_t *buf, const void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); uint64_t arc_buf_lsize(arc_buf_t *buf); void arc_buf_access(arc_buf_t *buf); void arc_release(arc_buf_t *buf, const void *tag); int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #else #define arc_referenced(buf) ((void) sizeof (buf), 0) #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, arc_write_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); /* * Level 2 ARC */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; #endif #ifdef __cplusplus } #endif #endif /* _SYS_ARC_H */ diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index adff42c55d05..defebe3b2fbb 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -1,1093 +1,1092 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Delphix. All rights reserved. * Copyright (c) 2013, Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. */ #ifndef _SYS_ARC_IMPL_H #define _SYS_ARC_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_uncached - uncacheable prefetch, to be evicted * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * supports the "dbufs" kstat */ arc_state_type_t arcs_state; /* * total amount of data in this state. */ zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * amount of hit bytes for this state (counted only for ghost states) */ wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; boolean_t acb_wait; int acb_wait_error; kmutex_t acb_wait_lock; kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_prev; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* protected by hash lock */ clock_t b_arc_access; uint32_t b_mru_hits; uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ zfs_refcount_t b_refcnt; arc_callback_t *b_acb; abd_t *b_pabd; #ifdef ZFS_DEBUG zio_cksum_t *b_freeze_cksum; kmutex_t b_freeze_lock; #endif } l1arc_buf_hdr_t; typedef enum l2arc_dev_hdr_flags_t { L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ } l2arc_dev_hdr_flags_t; /* * Pointer used in persistent L2ARC (for pointing to log blocks). */ typedef struct l2arc_log_blkptr { /* * Offset of log block within the device, in bytes */ uint64_t lbp_daddr; /* * Aligned payload size (in bytes) of the log block */ uint64_t lbp_payload_asize; /* * Offset in bytes of the first buffer in the payload */ uint64_t lbp_payload_start; /* * lbp_prop has the following format: * * logical size (in bytes) * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ uint64_t lbp_prop; zio_cksum_t lbp_cksum; /* checksum of log */ } l2arc_log_blkptr_t; /* * The persistent L2ARC device header. * Byte order of magic determines whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_dev_hdr_phys { uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ uint64_t dh_version; /* Persistent L2ARC version */ /* * Global L2ARC device state and metadata. */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ uint64_t dh_start; /* mirror of l2ad_start */ uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ /* * Mirrors of vdev_trim_action_time and vdev_trim_state, used to * display when the cache device was fully trimmed for the last * time. */ uint64_t dh_trim_action_time; uint64_t dh_trim_state; const uint64_t dh_pad[30]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; _Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE, "l2arc_dev_hdr_phys_t wrong size"); /* * A single ARC buffer header entry in a l2arc_log_blk_phys_t. */ typedef struct l2arc_log_ent_phys { dva_t le_dva; /* dva of buffer */ uint64_t le_birth; /* birth txg of buffer */ /* * le_prop has the following format: * * logical size (in bytes) * * physical (compressed) size (in bytes) * * compression algorithm * * object type (used to restore arc_buf_contents_t) * * protected status (used for encryption) * * prefetch status (used in l2arc_read_done()) */ uint64_t le_prop; uint64_t le_daddr; /* buf location on l2dev */ uint64_t le_complevel; /* * We pad the size of each entry to a power of 2 so that the size of * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, * because of the L2ARC_SET_*SIZE macros. */ const uint64_t le_pad[2]; /* pad to 64 bytes */ } l2arc_log_ent_phys_t; #define L2ARC_LOG_BLK_MAX_ENTRIES (1022) /* * A log block of up to 1022 ARC buffer log entries, chained into the * persistent L2ARC metadata linked list. Byte order of magic determines * whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_log_blk_phys { uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ /* * There are 2 chains (headed by dh_start_lbps[2]), and this field * points back to the previous block in this chain. We alternate * which chain we append to, so they are time-wise and offset-wise * interleaved, but that is an optimization rather than for * correctness. */ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ /* * Pad header section to 128 bytes */ uint64_t lb_pad[7]; /* Payload */ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; } l2arc_log_blk_phys_t; /* 64K total */ /* * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. */ _Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned"); _Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE, "l2arc_log_blk_phys_t too small"); _Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE, "l2arc_log_blk_phys_t too big"); /* * These structures hold in-flight abd buffers for log blocks as they're being * written to the L2ARC device. */ typedef struct l2arc_lb_abd_buf { abd_t *abd; list_node_t node; } l2arc_lb_abd_buf_t; /* * These structures hold pointers to log blocks present on the L2ARC device. */ typedef struct l2arc_lb_ptr_buf { l2arc_log_blkptr_t *lb_ptr; list_node_t node; } l2arc_lb_ptr_buf_t; /* Macros for setting fields in le_prop and lbp_prop */ #define L2BLK_GET_LSIZE(field) \ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_LSIZE(field, x) \ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_PSIZE(field) \ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_PSIZE(field, x) \ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_COMPRESS(field) \ BF64_GET((field), 32, SPA_COMPRESSBITS) #define L2BLK_SET_COMPRESS(field, x) \ BF64_SET((field), 32, SPA_COMPRESSBITS, x) #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) /* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ #define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) #define PTR_SWAP(x, y) \ do { \ void *tmp = (x);\ x = y; \ y = tmp; \ } while (0) #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ zfs_refcount_t l2ad_alloc; /* allocated bytes */ /* * Persistence-related stuff */ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ int l2ad_log_ent_idx; /* index into cur log blk */ /* Number of bytes in current log block's payload */ uint64_t l2ad_log_blk_payload_asize; /* * Offset (in bytes) of the first buffer in current log block's * payload. */ uint64_t l2ad_log_blk_payload_start; /* Flag indicating whether a rebuild is scheduled or is going on */ boolean_t l2ad_rebuild; boolean_t l2ad_rebuild_cancel; boolean_t l2ad_rebuild_began; uint64_t l2ad_log_entries; /* entries per log blk */ uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ zfs_refcount_t l2ad_lb_asize; /* * Number of log blocks present on the device. */ zfs_refcount_t l2ad_lb_count; boolean_t l2ad_trim_all; /* TRIM whole device */ } l2arc_dev_t; /* * Encrypted blocks will need to be stored encrypted on the L2ARC * disk as they appear in the main pool. In order for this to work we * need to pass around the encryption parameters so they can be used * to write data to the L2ARC. This struct is only defined in the * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED * flag set. */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; dmu_object_type_t b_ot; /* object type */ /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; /* * Technically this could be removed since we will always be able to * get the mac from the bp when we need it. However, it is inconvenient * for callers of arc code to have to pass a bp in all the time. This * also allows us to assert that L2ARC data is properly encrypted to * match the data in the main storage pool. */ uint8_t b_mac[ZIO_DATA_MAC_LEN]; } arc_buf_hdr_crypt_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ /* in-flight list of log blocks */ list_t l2wcb_abd_list; } l2arc_write_callback_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; uint8_t b_complevel; uint8_t b_reserved1; /* used for 4 byte alignment */ uint16_t b_reserved2; /* used for 4 byte alignment */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; /* * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED * is set and the L1 header exists. */ arc_buf_hdr_crypt_t b_crypt_hdr; }; typedef struct arc_stats { /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; /* Number of requests for which I/O was already running. */ kstat_named_t arcstat_iohits; /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_uncached_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped when updating the access state due to the * header having already been released after acquiring the hash lock. */ kstat_named_t arcstat_access_skip; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_eligible_mfu; kstat_named_t arcstat_evict_l2_eligible_mru; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_meta; kstat_named_t arcstat_pd; kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by dmu_buf_impl_t objects. */ kstat_named_t arcstat_dbuf_size; /* * Number of bytes consumed by dnode_t objects. */ kstat_named_t arcstat_dnode_size; /* * Number of bytes consumed by bonus buffers. */ kstat_named_t arcstat_bonus_size; #if defined(COMPAT_FREEBSD11) /* * Sum of the previous three counters, provided for compatibility. */ kstat_named_t arcstat_other_size; #endif /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; kstat_named_t arcstat_anon_data; kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; kstat_named_t arcstat_mru_data; kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; kstat_named_t arcstat_mru_ghost_data; kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; kstat_named_t arcstat_mfu_data; kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; kstat_named_t arcstat_mfu_ghost_data; kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; /* * Total number of bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; kstat_named_t arcstat_uncached_data; kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_data; /* * Number of metadata bytes that that are going to be evicted from ARC * due to ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* * Allocated size (in bytes) of L2ARC cached buffers by ARC state. */ kstat_named_t arcstat_l2_prefetch_asize; kstat_named_t arcstat_l2_mru_asize; kstat_named_t arcstat_l2_mfu_asize; /* * Allocated size (in bytes) of L2ARC cached buffers by buffer content * type. */ kstat_named_t arcstat_l2_bufc_data_asize; kstat_named_t arcstat_l2_bufc_metadata_asize; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_lsize; kstat_named_t arcstat_l2_psize; kstat_named_t arcstat_l2_hdr_size; /* * Number of L2ARC log blocks written. These are used for restoring the * L2ARC. Updated during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_log_blk_writes; /* * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ kstat_named_t arcstat_l2_log_blk_avg_asize; /* Aligned size of L2ARC log blocks on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_asize; /* Number of L2ARC log blocks present on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_count; /* * Moving average of the aligned size of L2ARC restored data, in bytes, * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; /* * Number of times the L2ARC rebuild was successful for an L2ARC device. */ kstat_named_t arcstat_l2_rebuild_success; /* * Number of times the L2ARC rebuild failed because the device header * was in an unsupported format or corrupted. */ kstat_named_t arcstat_l2_rebuild_abort_unsupported; /* * Number of times the L2ARC rebuild failed because of IO errors * while reading a log block. */ kstat_named_t arcstat_l2_rebuild_abort_io_errors; /* * Number of times the L2ARC rebuild failed because of IO errors when * reading the device header. */ kstat_named_t arcstat_l2_rebuild_abort_dh_errors; /* * Number of L2ARC log blocks which failed to be restored due to * checksum errors. */ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; /* * Number of times the L2ARC rebuild was aborted due to low system * memory. */ kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; /* Aligned size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. */ kstat_named_t arcstat_l2_rebuild_bufs; /* * Number of L2ARC log entries (buffers) already cached in ARC. These * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. */ kstat_named_t arcstat_l2_rebuild_log_blks; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_memory_all_bytes; kstat_named_t arcstat_memory_free_bytes; kstat_named_t arcstat_memory_available_bytes; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; /* Number of requests for which predictive prefetch was running. */ kstat_named_t arcstat_demand_iohit_predictive_prefetch; /* Number of prescient prefetch requests. */ kstat_named_t arcstat_prescient_prefetch; /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; /* Number of requests for which prescient prefetch was running. */ kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; kstat_named_t arcstat_cached_only_in_progress; kstat_named_t arcstat_abd_chunk_waste_size; } arc_stats_t; typedef struct arc_sums { wmsum_t arcstat_hits; wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; wmsum_t arcstat_mfu_hits; wmsum_t arcstat_mfu_ghost_hits; wmsum_t arcstat_uncached_hits; wmsum_t arcstat_deleted; wmsum_t arcstat_mutex_miss; wmsum_t arcstat_access_skip; wmsum_t arcstat_evict_skip; wmsum_t arcstat_evict_not_enough; wmsum_t arcstat_evict_l2_cached; wmsum_t arcstat_evict_l2_eligible; wmsum_t arcstat_evict_l2_eligible_mfu; wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; wmsum_t arcstat_compressed_size; wmsum_t arcstat_uncompressed_size; wmsum_t arcstat_overhead_size; wmsum_t arcstat_hdr_size; wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; wmsum_t arcstat_l2_prefetch_asize; wmsum_t arcstat_l2_mru_asize; wmsum_t arcstat_l2_mfu_asize; wmsum_t arcstat_l2_bufc_data_asize; wmsum_t arcstat_l2_bufc_metadata_asize; wmsum_t arcstat_l2_feeds; wmsum_t arcstat_l2_rw_clash; wmsum_t arcstat_l2_read_bytes; wmsum_t arcstat_l2_write_bytes; wmsum_t arcstat_l2_writes_sent; wmsum_t arcstat_l2_writes_done; wmsum_t arcstat_l2_writes_error; wmsum_t arcstat_l2_writes_lock_retry; wmsum_t arcstat_l2_evict_lock_retry; wmsum_t arcstat_l2_evict_reading; wmsum_t arcstat_l2_evict_l1cached; wmsum_t arcstat_l2_free_on_write; wmsum_t arcstat_l2_abort_lowmem; wmsum_t arcstat_l2_cksum_bad; wmsum_t arcstat_l2_io_error; wmsum_t arcstat_l2_lsize; wmsum_t arcstat_l2_psize; aggsum_t arcstat_l2_hdr_size; wmsum_t arcstat_l2_log_blk_writes; wmsum_t arcstat_l2_log_blk_asize; wmsum_t arcstat_l2_log_blk_count; wmsum_t arcstat_l2_rebuild_success; wmsum_t arcstat_l2_rebuild_abort_unsupported; wmsum_t arcstat_l2_rebuild_abort_io_errors; wmsum_t arcstat_l2_rebuild_abort_dh_errors; wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; wmsum_t arcstat_l2_rebuild_abort_lowmem; wmsum_t arcstat_l2_rebuild_size; wmsum_t arcstat_l2_rebuild_asize; wmsum_t arcstat_l2_rebuild_bufs; wmsum_t arcstat_l2_rebuild_bufs_precached; wmsum_t arcstat_l2_rebuild_log_blks; wmsum_t arcstat_memory_throttle_count; wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; wmsum_t arcstat_demand_iohit_predictive_prefetch; wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; } arc_sums_t; typedef struct arc_evict_waiter { list_node_t aew_node; kcondvar_t aew_cv; uint64_t aew_count; } arc_evict_waiter_t; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ wmsum_add(&arc_sums.stat, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ #define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ #define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ #define arc_anon (&ARC_anon) #define arc_mru (&ARC_mru) #define arc_mru_ghost (&ARC_mru_ghost) #define arc_mfu (&ARC_mfu) #define arc_mfu_ghost (&ARC_mfu_ghost) #define arc_l2c_only (&ARC_l2c_only) #define arc_uncached (&ARC_uncached) extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; extern uint_t arc_grow_retry; extern uint_t arc_no_grow_shift; extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; extern uint_t arc_lotsfree_percent; extern uint64_t zfs_arc_min; extern uint64_t zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); -extern void arc_prune_async(uint64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); extern void arc_register_hotplug(void); extern void arc_unregister_hotplug(void); extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); /* used in zdb.c */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); /* used in vdev_trim.c */ void l2arc_dev_hdr_update(l2arc_dev_t *dev); l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); #ifdef __cplusplus } #endif #endif /* _SYS_ARC_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index b90855687411..87ddbd90e170 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -1,1237 +1,1237 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Datto Inc. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; typedef struct zbookmark_err_phys zbookmark_err_phys_t; struct bpobj; struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). */ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 #define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * * The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). * The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */ BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_phys_birth; /* txg when block was allocated */ uint64_t blk_birth; /* transaction group at birth */ uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ /* * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for * this gang DVA including its children BP's. The space allocated at this * DVA's vdev/offset is vdev_gang_header_asize(vdev). */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) /* encrypted, authenticated, and MAC cksum bps use the same bit */ #define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) #define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) #define BP_IS_ENCRYPTED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_IS_AUTHENTICATED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) #define BP_IS_PROTECTED(bp) \ (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } #define BP_GET_FILL(bp) \ ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) #define BP_SET_FILL(bp, fill) \ { \ if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ } #define BP_GET_IV2(bp) \ (ASSERT(BP_IS_ENCRYPTED(bp)), \ BF64_GET((bp)->blk_fill, 32, 32)) #define BP_SET_IV2(bp, iv2) \ { \ ASSERT(BP_IS_ENCRYPTED(bp)); \ BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) #define BP_SET_REDACTED(bp) \ { \ BP_SET_EMBEDDED(bp, B_TRUE); \ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ } #define BP_IS_REDACTED(bp) \ (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #ifdef _ZFS_BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either kmem_scnprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *const copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ ASSERT3S(copies, >, 0); \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ (u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; typedef enum spa_mode { SPA_MODE_UNINIT = 0, SPA_MODE_READ = 1, SPA_MODE_WRITE = 2, } spa_mode_t; /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, } spa_autotrim_t; /* * Reason TRIM command was issued, used internally for accounting purposes. */ typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_alloc(spa_t *spa, uint64_t guid); extern int spa_vdev_noalloc(spa_t *spa, uint64_t guid); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); extern void spa_close(spa_t *spa, const void *tag); extern void spa_async_close(spa_t *spa, const void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *priv; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t guid; /* pool guid */ spa_history_kstat_t iostats; } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; /* Assorted pool IO kstats */ typedef struct spa_iostats { kstat_named_t trim_extents_written; kstat_named_t trim_bytes_written; kstat_named_t trim_extents_skipped; kstat_named_t trim_bytes_skipped; kstat_named_t trim_extents_failed; kstat_named_t trim_bytes_failed; kstat_named_t autotrim_extents_written; kstat_named_t autotrim_bytes_written; kstat_named_t autotrim_extents_skipped; kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; kstat_named_t simple_trim_extents_written; kstat_named_t simple_trim_bytes_written; kstat_named_t simple_trim_extents_skipped; kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining); extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern uint32_t spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_load_note(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); extern void zfs_ereport_taskq_fini(void); extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx); extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, dmu_tx_t *tx); extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg); extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); extern void zfs_ereport_zvol_post(const char *subclass, const char *name, const char *device_name, const char *raw_name); /* waiting for pool activities to complete */ extern int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited); extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited); extern void spa_notify_waiters(spa_t *spa); extern void spa_wake_waiters(spa_t *spa); extern void spa_import_os(spa_t *spa); extern void spa_export_os(spa_t *spa); extern void spa_activate_os(spa_t *spa); extern void spa_deactivate_os(spa_t *spa); /* module param call functions */ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; extern uint64_t zfs_deadman_synctime_ms; extern uint64_t zfs_deadman_ziotime_ms; extern uint64_t zfs_deadman_checktime_ms; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ diff --git a/sys/contrib/openzfs/include/sys/txg_impl.h b/sys/contrib/openzfs/include/sys/txg_impl.h index 45fde2e1f351..8ab7969b25be 100644 --- a/sys/contrib/openzfs/include/sys/txg_impl.h +++ b/sys/contrib/openzfs/include/sys/txg_impl.h @@ -1,125 +1,124 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H #define _SYS_TXG_IMPL_H #include #include #ifdef __cplusplus extern "C" { #endif /* * The tx_cpu structure is a per-cpu structure that is used to track * the number of active transaction holds (tc_count). As transactions * are assigned into a transaction group the appropriate tc_count is * incremented to indicate that there are pending changes that have yet * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement * the tc_count. A transaction group is not considered quiesced until all * tx_cpu structures have reached a tc_count of zero. * * This structure is a per-cpu structure by design. Updates to this structure * are frequent and concurrent. Having a single structure would result in * heavy lock contention so a per-cpu design was implemented. With the fanned * out mutex design, consumers only need to lock the mutex associated with * thread's cpu. * * The tx_cpu contains two locks, the tc_lock and tc_open_lock. * The tc_lock is used to protect all members of the tx_cpu structure with * the exception of the tc_open_lock. This lock should only be held for a * short period of time, typically when updating the value of tc_count. * * The tc_open_lock protects the tx_open_txg member of the tx_state structure. * This lock is used to ensure that transactions are only assigned into * the current open transaction group. In order to move the current open * transaction group to the quiesce phase, the txg_quiesce thread must * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. * The tc_open_lock is held until the transaction is assigned into the * transaction group. Typically, this is a short operation but if throttling * is occurring it may be held for longer periods of time. */ struct tx_cpu { kmutex_t tc_open_lock; /* protects tx_open_txg */ kmutex_t tc_lock; /* protects the rest of this struct */ kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; +} ____cacheline_aligned; /* * The tx_state structure maintains the state information about the different * stages of the pool's transaction groups. A per pool tx_state structure * is used to track this information. The tx_state structure also points to * an array of tx_cpu structures (described above). Although the tx_sync_lock * is used to protect the members of this structure, it is not used to * protect the tx_open_txg. Instead a special lock in the tx_cpu structure * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. * Any thread wishing to update tx_open_txg must grab the tc_open_lock on * every cpu (see txg_quiesce()). */ typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ hrtime_t tx_open_time; /* start time of tx_open_txg */ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ kcondvar_t tx_sync_more_cv; kcondvar_t tx_sync_done_cv; kcondvar_t tx_quiesce_more_cv; kcondvar_t tx_quiesce_done_cv; kcondvar_t tx_timeout_cv; kcondvar_t tx_exit_cv; /* wait for all threads to exit */ uint8_t tx_threads; /* number of threads */ uint8_t tx_exiting; /* set when we're exiting */ kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus } #endif #endif /* _SYS_TXG_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index ad9dc3aefd8e..3f2312c23438 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -1,649 +1,652 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; extern uint_t zfs_vdev_def_queue_depth; extern uint_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, range_seg64_t *physical, range_seg64_t *remain); typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { vdev_init_func_t *vdev_op_init; vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; vdev_xlation_func_t *vdev_op_xlate; vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; vdev_metaslab_init_func_t *vdev_op_metaslab_init; vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ typedef union vdev_queue_class { - list_t vqc_list; + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ uint32_t vq_cqueued; /* Classes with queued I/Os. */ uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; }; typedef enum vdev_alloc_bias { VDEV_BIAS_NONE, VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ } vdev_alloc_bias_t; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ /* * Logical block alignment shift * * The smallest sized/aligned I/O supported by the device. */ uint64_t vdev_logical_ashift; /* * Physical block alignment shift * * The device supports logical I/Os with vdev_logical_ashift * size/alignment, but optimum performance will be achieved by * aligning/sizing requests to vdev_physical_ashift. Smaller * requests may be inflated or incur device level read-modify-write * operations. * * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). */ uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ int vdev_load_error; /* error on last load */ int vdev_open_error; /* error on last open */ int vdev_validate_error; /* error on last validate */ kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ uint64_t vdev_root_zap; /* * Top-level vdev state. */ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ /* Initialize related */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; list_node_t vdev_initialize_node; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. */ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; range_tree_t *vdev_initialize_tree; /* valid while initializing */ uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; uint64_t vdev_initialize_action_time; /* start and end time */ /* TRIM related */ boolean_t vdev_trim_exit_wanted; boolean_t vdev_autotrim_exit_wanted; vdev_trim_state_t vdev_trim_state; list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; kcondvar_t vdev_trim_cv; kthread_t *vdev_trim_thread; uint64_t vdev_trim_offset[TXG_SIZE]; uint64_t vdev_trim_last_offset; uint64_t vdev_trim_bytes_est; uint64_t vdev_trim_bytes_done; uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ uint64_t vdev_trim_partial; /* requested partial TRIM */ uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ /* Rebuild related */ boolean_t vdev_rebuilding; boolean_t vdev_rebuild_exit_wanted; boolean_t vdev_rebuild_cancel_wanted; boolean_t vdev_rebuild_reset_wanted; kmutex_t vdev_rebuild_lock; kcondvar_t vdev_rebuild_cv; kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). * Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). */ kmutex_t vdev_scan_io_queue_lock; struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. */ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_enc_sysfs_path; /* enclosure sysfs path */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_has_trim; /* TRIM is supported */ boolean_t vdev_has_securetrim; /* secure TRIM is supported */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? */ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. */ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; /* * Checksum and IO thresholds for tuning ZED */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; }; #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock * ring when MMP is enabled. */ #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef enum vbe_vers { /* * The bootenv file is stored as ascii text in the envblock. * It is used by the GRUB bootloader used on Linux to store the * contents of the grubenv file. The file is stored as raw ASCII, * and is protected by an embedded checksum. By default, GRUB will * check if the boot filesystem supports storing the environment data * in a special location, and if so, will invoke filesystem specific * logic to retrieve it. This can be overridden by a variable, should * the user so desire. */ VB_RAW = 0, /* * The bootenv file is converted to an nvlist and then packed into the * envblock. */ VB_NVLIST = 1 } vbe_vers_t; typedef struct vdev_boot_envblock { uint64_t vbe_version; char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - sizeof (zio_eck_t)]; zio_eck_t vbe_zbt; } vdev_boot_envblock_t; _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE, "vdev_boot_envblock_t wrong size"); typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. */ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_OFFSET_IS_LABEL(vd, off) \ (((off) < VDEV_LABEL_START_SIZE) || \ ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 #define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev */ extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); /* * Add or remove children and parents */ extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_compact_children(vdev_t *pvd); extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* * Available vdev types. */ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_draid_ops; extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables */ extern int zfs_vdev_standard_sm_blksz; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj); extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables */ extern uint_t zfs_vdev_min_auto_ashift; extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h index c1037fa12e30..73c26dff1e0e 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h @@ -1,389 +1,389 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. */ #ifndef _VDEV_RAIDZ_H #define _VDEV_RAIDZ_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif #define CODE_P (0U) #define CODE_Q (1U) #define CODE_R (2U) #define PARITY_P (1U) #define PARITY_PQ (2U) #define PARITY_PQR (3U) #define TARGET_X (0U) #define TARGET_Y (1U) #define TARGET_Z (2U) /* * Parity generation methods indexes */ enum raidz_math_gen_op { RAIDZ_GEN_P = 0, RAIDZ_GEN_PQ, RAIDZ_GEN_PQR, RAIDZ_GEN_NUM = 3 }; /* * Data reconstruction methods indexes */ enum raidz_rec_op { RAIDZ_REC_P = 0, RAIDZ_REC_Q, RAIDZ_REC_R, RAIDZ_REC_PQ, RAIDZ_REC_PR, RAIDZ_REC_QR, RAIDZ_REC_PQR, RAIDZ_REC_NUM = 7 }; extern const char *const raidz_gen_name[RAIDZ_GEN_NUM]; extern const char *const raidz_rec_name[RAIDZ_REC_NUM]; /* * Methods used to define raidz implementation * * @raidz_gen_f Parity generation function * @par1 pointer to raidz_map * @raidz_rec_f Data reconstruction function * @par1 pointer to raidz_map * @par2 array of reconstruction targets * @will_work_f Function returns TRUE if impl. is supported on the system * @init_impl_f Function is called once on init * @fini_impl_f Function is called once on fini */ typedef void (*raidz_gen_f)(void *); typedef int (*raidz_rec_f)(void *, const int *); typedef boolean_t (*will_work_f)(void); typedef void (*init_impl_f)(void); typedef void (*fini_impl_f)(void); #define RAIDZ_IMPL_NAME_MAX (20) typedef struct raidz_impl_ops { init_impl_f init; fini_impl_f fini; raidz_gen_f gen[RAIDZ_GEN_NUM]; /* Parity generate functions */ raidz_rec_f rec[RAIDZ_REC_NUM]; /* Data reconstruction functions */ will_work_f is_supported; /* Support check function */ char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ } raidz_impl_ops_t; typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ uint8_t rc_force_repair; /* Write good data to this column */ uint8_t rc_allow_repair; /* Allow repair I/O to this column */ } raidz_col_t; typedef struct raidz_row { uint64_t rr_cols; /* Regular column count */ uint64_t rr_scols; /* Count including skipped columns */ uint64_t rr_bigcols; /* Remainder data column count */ uint64_t rr_missingdata; /* Count of missing data devices */ uint64_t rr_missingparity; /* Count of missing parity devices */ uint64_t rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ #endif - raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ + raidz_col_t rr_col[]; /* Flexible array of I/O columns */ } raidz_row_t; typedef struct raidz_map { boolean_t rm_ecksuminjected; /* checksum error was injected */ int rm_nrows; /* Regular row count */ int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_row_t *rm_row[0]; /* flexible array of rows */ + raidz_row_t *rm_row[]; /* flexible array of rows */ } raidz_map_t; #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; extern boolean_t raidz_will_scalar_work(void); #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_sse2_impl; #endif #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_ssse3_impl; #endif #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_avx2_impl; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_avx512f_impl; #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ extern const raidz_impl_ops_t vdev_raidz_avx512bw_impl; #endif #if defined(__aarch64__) extern const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl; extern const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl; #endif #if defined(__powerpc__) extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; #endif /* * Commonly used raidz_map helpers * * raidz_parity Returns parity of the RAIDZ block * raidz_ncols Returns number of columns the block spans * Note, all rows have the same number of columns. * raidz_nbigcols Returns number of big columns * raidz_col_p Returns pointer to a column * raidz_col_size Returns size of a column * raidz_big_size Returns size of big columns * raidz_short_size Returns size of short columns */ #define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol) #define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols) #define raidz_nbigcols(rm) ((rm)->rm_bigcols) #define raidz_col_p(rm, c) ((rm)->rm_col + (c)) #define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) #define raidz_big_size(rm) (raidz_col_size(rm, CODE_P)) #define raidz_short_size(rm) (raidz_col_size(rm, raidz_ncols(rm)-1)) /* * Macro defines an RAIDZ parity generation method * * @code parity the function produce * @impl name of the implementation */ #define _RAIDZ_GEN_WRAP(code, impl) \ static void \ impl ## _gen_ ## code(void *rrp) \ { \ raidz_row_t *rr = (raidz_row_t *)rrp; \ raidz_generate_## code ## _impl(rr); \ } /* * Macro defines an RAIDZ data reconstruction method * * @code parity the function produce * @impl name of the implementation */ #define _RAIDZ_REC_WRAP(code, impl) \ static int \ impl ## _rec_ ## code(void *rrp, const int *tgtidx) \ { \ raidz_row_t *rr = (raidz_row_t *)rrp; \ return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \ } /* * Define all gen methods for an implementation * * @impl name of the implementation */ #define DEFINE_GEN_METHODS(impl) \ _RAIDZ_GEN_WRAP(p, impl); \ _RAIDZ_GEN_WRAP(pq, impl); \ _RAIDZ_GEN_WRAP(pqr, impl) /* * Define all rec functions for an implementation * * @impl name of the implementation */ #define DEFINE_REC_METHODS(impl) \ _RAIDZ_REC_WRAP(p, impl); \ _RAIDZ_REC_WRAP(q, impl); \ _RAIDZ_REC_WRAP(r, impl); \ _RAIDZ_REC_WRAP(pq, impl); \ _RAIDZ_REC_WRAP(pr, impl); \ _RAIDZ_REC_WRAP(qr, impl); \ _RAIDZ_REC_WRAP(pqr, impl) #define RAIDZ_GEN_METHODS(impl) \ { \ [RAIDZ_GEN_P] = & impl ## _gen_p, \ [RAIDZ_GEN_PQ] = & impl ## _gen_pq, \ [RAIDZ_GEN_PQR] = & impl ## _gen_pqr \ } #define RAIDZ_REC_METHODS(impl) \ { \ [RAIDZ_REC_P] = & impl ## _rec_p, \ [RAIDZ_REC_Q] = & impl ## _rec_q, \ [RAIDZ_REC_R] = & impl ## _rec_r, \ [RAIDZ_REC_PQ] = & impl ## _rec_pq, \ [RAIDZ_REC_PR] = & impl ## _rec_pr, \ [RAIDZ_REC_QR] = & impl ## _rec_qr, \ [RAIDZ_REC_PQR] = & impl ## _rec_pqr \ } typedef struct raidz_impl_kstat { uint64_t gen[RAIDZ_GEN_NUM]; /* gen method speed B/s */ uint64_t rec[RAIDZ_REC_NUM]; /* rec method speed B/s */ } raidz_impl_kstat_t; /* * Enumerate various multiplication constants * used in reconstruction methods */ typedef enum raidz_mul_info { /* Reconstruct Q */ MUL_Q_X = 0, /* Reconstruct R */ MUL_R_X = 0, /* Reconstruct PQ */ MUL_PQ_X = 0, MUL_PQ_Y = 1, /* Reconstruct PR */ MUL_PR_X = 0, MUL_PR_Y = 1, /* Reconstruct QR */ MUL_QR_XQ = 0, MUL_QR_X = 1, MUL_QR_YQ = 2, MUL_QR_Y = 3, /* Reconstruct PQR */ MUL_PQR_XP = 0, MUL_PQR_XQ = 1, MUL_PQR_XR = 2, MUL_PQR_YU = 3, MUL_PQR_YP = 4, MUL_PQR_YQ = 5, MUL_CNT = 6 } raidz_mul_info_t; /* * Powers of 2 in the Galois field. */ extern const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))); /* Logs of 2 in the Galois field defined above. */ extern const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))); /* * Multiply a given number by 2 raised to the given power. */ static inline uint8_t vdev_raidz_exp2(const uint8_t a, const unsigned exp) { if (a == 0) return (0); return (vdev_raidz_pow2[(exp + (unsigned)vdev_raidz_log2[a]) % 255]); } /* * Galois Field operations. * * gf_exp2 - computes 2 raised to the given power * gf_exp4 - computes 4 raised to the given power * gf_mul - multiplication * gf_div - division * gf_inv - multiplicative inverse */ typedef unsigned gf_t; typedef unsigned gf_log_t; static inline gf_t gf_mul(const gf_t a, const gf_t b) { gf_log_t logsum; if (a == 0 || b == 0) return (0); logsum = (gf_log_t)vdev_raidz_log2[a] + (gf_log_t)vdev_raidz_log2[b]; return ((gf_t)vdev_raidz_pow2[logsum % 255]); } static inline gf_t gf_div(const gf_t a, const gf_t b) { gf_log_t logsum; ASSERT3U(b, >, 0); if (a == 0) return (0); logsum = (gf_log_t)255 + (gf_log_t)vdev_raidz_log2[a] - (gf_log_t)vdev_raidz_log2[b]; return ((gf_t)vdev_raidz_pow2[logsum % 255]); } static inline gf_t gf_inv(const gf_t a) { gf_log_t logsum; ASSERT3U(a, >, 0); logsum = (gf_log_t)255 - (gf_log_t)vdev_raidz_log2[a]; return ((gf_t)vdev_raidz_pow2[logsum]); } static inline gf_t gf_exp2(gf_log_t exp) { return (vdev_raidz_pow2[exp % 255]); } static inline gf_t gf_exp4(gf_log_t exp) { ASSERT3U(exp, <=, 255); return ((gf_t)vdev_raidz_pow2[(2 * exp) % 255]); } #ifdef __cplusplus } #endif #endif /* _VDEV_RAIDZ_H */ diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 6a337b49edf3..750ca612b962 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -1,788 +1,790 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif /* * This code compiles in three different contexts. When __KERNEL__ is defined, * the code uses "unix-like" kernel interfaces. When _STANDALONE is defined, the * code is running in a reduced capacity environment of the boot loader which is * generally a subset of both POSIX and kernel interfaces (with a few unique * interfaces too). When neither are defined, it's in a userland POSIX or * similar environment. */ #if defined(__KERNEL__) || defined(_STANDALONE) #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #else /* _KERNEL || _STANDALONE */ #define _SYS_MUTEX_H #define _SYS_RWLOCK_H #define _SYS_CONDVAR_H #define _SYS_VNODE_H #define _SYS_VFS_H #define _SYS_SUNDDI_H #define _SYS_CALLB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Stack */ #define noinline __attribute__((noinline)) #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) /* * Debugging */ /* * Note that we are not using the debugging levels. */ #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ #define CE_WARN 2 /* warning */ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ /* * ZFS debugging */ extern void dprintf_setup(int *argc, char **argv); extern void cmn_err(int, const char *, ...) __attribute__((format(printf, 2, 3))); extern void vcmn_err(int, const char *, va_list) __attribute__((format(printf, 2, 0))); extern void panic(const char *, ...) __attribute__((format(printf, 1, 2), noreturn)); extern void vpanic(const char *, va_list) __attribute__((format(printf, 1, 0), noreturn)); #define fm_panic panic /* * DTrace SDT probes have different signatures in userland than they do in * the kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. * * Here's an example of how to use the set-error probes in userland: * zfs$target:::set-error /arg0 == EBUSY/ {stack();} * * Here's an example of how to use DTRACE_PROBE probes in userland: * If there is a probe declared as follows: * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); * Then you can use it as follows: * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ * {printf("%u %p\n", arg1, arg2);} */ #ifdef DTRACE_PROBE #undef DTRACE_PROBE #endif /* DTRACE_PROBE */ #define DTRACE_PROBE(a) #ifdef DTRACE_PROBE1 #undef DTRACE_PROBE1 #endif /* DTRACE_PROBE1 */ #define DTRACE_PROBE1(a, b, c) #ifdef DTRACE_PROBE2 #undef DTRACE_PROBE2 #endif /* DTRACE_PROBE2 */ #define DTRACE_PROBE2(a, b, c, d, e) #ifdef DTRACE_PROBE3 #undef DTRACE_PROBE3 #endif /* DTRACE_PROBE3 */ #define DTRACE_PROBE3(a, b, c, d, e, f, g) #ifdef DTRACE_PROBE4 #undef DTRACE_PROBE4 #endif /* DTRACE_PROBE4 */ #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) /* * Tunables. */ typedef struct zfs_kernel_param { const char *name; /* unused stub */ } zfs_kernel_param_t; #define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) #define ZFS_MODULE_PARAM_ARGS void #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ getfunc, perm, desc) /* * Threads. */ typedef pthread_t kthread_t; #define TS_RUN 0x00000002 #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { uintptr_t this_is_never_used_dont_dereference_it; } proc_t; extern struct proc p0; #define curproc (&p0) #define PS_NONE -1 extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) #define KPREEMPT_SYNC (-1) #define kpreempt(x) sched_yield() #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) /* * Mutexes */ typedef struct kmutex { pthread_mutex_t m_lock; pthread_t m_owner; } kmutex_t; #define MUTEX_DEFAULT 0 #define MUTEX_NOLOCKDEP MUTEX_DEFAULT #define MUTEX_HELD(mp) pthread_equal((mp)->m_owner, pthread_self()) #define MUTEX_NOT_HELD(mp) !MUTEX_HELD(mp) extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) +#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp) /* * RW locks */ typedef struct krwlock { pthread_rwlock_t rw_lock; pthread_t rw_owner; uint_t rw_readers; } krwlock_t; typedef int krw_t; #define RW_READER 0 #define RW_WRITER 1 #define RW_DEFAULT RW_READER #define RW_NOLOCKDEP RW_READER #define RW_READ_HELD(rw) ((rw)->rw_readers > 0) #define RW_WRITE_HELD(rw) pthread_equal((rw)->rw_owner, pthread_self()) #define RW_LOCK_HELD(rw) (RW_READ_HELD(rw) || RW_WRITE_HELD(rw)) extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); extern void rw_destroy(krwlock_t *rwlp); extern void rw_enter(krwlock_t *rwlp, krw_t rw); extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); extern int rw_tryupgrade(krwlock_t *rwlp); extern void rw_exit(krwlock_t *rwlp); #define rw_downgrade(rwlp) do { } while (0) /* * Credentials */ extern uid_t crgetuid(cred_t *cr); extern uid_t crgetruid(cred_t *cr); extern gid_t crgetgid(cred_t *cr); extern int crgetngroups(cred_t *cr); extern gid_t *crgetgroups(cred_t *cr); /* * Condition variables */ typedef pthread_cond_t kcondvar_t; #define CV_DEFAULT 0 #define CALLOUT_FLAG_ABSOLUTE 0x2 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); #define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_idle(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_wait_io(cv, mp) cv_wait(cv, mp) #define cv_wait_idle(cv, mp) cv_wait(cv, mp) #define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp) #define cv_timedwait_sig_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) #define cv_timedwait_idle_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) /* * Thread-specific data */ #define tsd_get(k) pthread_getspecific(k) #define tsd_set(k, v) pthread_setspecific(k, v) #define tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d) #define tsd_destroy(kp) /* nothing */ #ifdef __FreeBSD__ typedef off_t loff_t; #endif /* * kstat creation, installation and deletion */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)); /* * procfs list manipulation */ typedef struct procfs_list { void *pl_private; kmutex_t pl_lock; list_t pl_list; uint64_t pl_next_id; size_t pl_node_offset; } procfs_list_t; #ifndef __cplusplus struct seq_file { }; void seq_printf(struct seq_file *m, const char *fmt, ...); typedef struct procfs_list_node { list_node_t pln_link; uint64_t pln_id; } procfs_list_node_t; void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off); void procfs_list_uninstall(procfs_list_t *procfs_list); void procfs_list_destroy(procfs_list_t *procfs_list); void procfs_list_add(procfs_list_t *procfs_list, void *p); #endif /* * Kernel memory */ #define KM_SLEEP UMEM_NOFAIL #define KM_PUSHPAGE KM_SLEEP #define KM_NOSLEEP UMEM_DEFAULT #define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ #define KMC_NODEBUG UMC_NODEBUG #define KMC_KVMEM 0x0 #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) #define vmem_alloc(_s, _f) kmem_alloc(_s, _f) #define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) #define vmem_free(_b, _s) kmem_free(_b, _s) #define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) #define kmem_cache_destroy(_c) umem_cache_destroy(_c) #define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) #define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) umem_cache_reap_now(_c); #define kmem_cache_set_move(_c, _cb) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { KMEM_CBRC_YES, KMEM_CBRC_NO, KMEM_CBRC_LATER, KMEM_CBRC_DONT_NEED, KMEM_CBRC_DONT_KNOW } kmem_cbrc_t; /* * Task queues */ #define TASKQ_NAMELEN 31 typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { struct taskq_ent *tqent_next; struct taskq_ent *tqent_prev; task_func_t *tqent_func; void *tqent_arg; uintptr_t tqent_flags; } taskq_ent_t; typedef struct taskq { char tq_name[TASKQ_NAMELEN + 1]; kmutex_t tq_lock; krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; kthread_t **tq_threadlist; int tq_flags; int tq_active; int tq_nthreads; int tq_nalloc; int tq_minalloc; int tq_maxalloc; kcondvar_t tq_maxalloc_cv; int tq_maxalloc_wait; taskq_ent_t *tq_freelist; taskq_ent_t tq_task; } taskq_t; #define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ #define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ #define TQ_SLEEP KM_SLEEP /* Can block for memory */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ #define TASKQID_INVALID ((taskqid_t)0) extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 extern char *vn_dumpdir; #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ typedef struct xoptattr { inode_timespec_t xoa_createtime; /* Create time of file */ uint8_t xoa_archive; uint8_t xoa_system; uint8_t xoa_readonly; uint8_t xoa_hidden; uint8_t xoa_nounlink; uint8_t xoa_immutable; uint8_t xoa_appendonly; uint8_t xoa_nodump; uint8_t xoa_settable; uint8_t xoa_opaque; uint8_t xoa_av_quarantined; uint8_t xoa_av_modified; uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; uint8_t xoa_reparse; uint8_t xoa_offline; uint8_t xoa_sparse; } xoptattr_t; typedef struct vattr { uint_t va_mask; /* bit-mask of attributes */ u_offset_t va_size; /* file size in bytes */ } vattr_t; typedef struct xvattr { vattr_t xva_vattr; /* Embedded vattr structure */ uint32_t xva_magic; /* Magic Number */ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ xoptattr_t xva_xoptattrs; /* Optional attributes */ } xvattr_t; typedef struct vsecattr { uint_t vsa_mask; /* See below */ int vsa_aclcnt; /* ACL entry count */ void *vsa_aclentp; /* pointer to ACL entries */ int vsa_dfaclcnt; /* default ACL entry count */ void *vsa_dfaclentp; /* pointer to default ACL entries */ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ } vsecattr_t; #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 #define AT_FSID 0x00010 #define AT_NODEID 0x00020 #define AT_NLINK 0x00040 #define AT_SIZE 0x00080 #define AT_ATIME 0x00100 #define AT_MTIME 0x00200 #define AT_CTIME 0x00400 #define AT_RDEV 0x00800 #define AT_BLKSIZE 0x01000 #define AT_NBLOCKS 0x02000 #define AT_SEQ 0x08000 #define AT_XVATTR 0x10000 #define CRCREAT 0 #define F_FREESP 11 #define FIGNORECASE 0x80000 /* request case-insensitive lookups */ /* * Random stuff */ #define ddi_get_lbolt() (gethrtime() >> 23) #define ddi_get_lbolt64() (gethrtime() >> 23) #define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ #define ddi_time_before(a, b) (a < b) #define ddi_time_after(a, b) ddi_time_before(b, a) #define ddi_time_before_eq(a, b) (!ddi_time_after(a, b)) #define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a) #define ddi_time_before64(a, b) (a < b) #define ddi_time_after64(a, b) ddi_time_before64(b, a) #define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b)) #define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) extern void delay(clock_t ticks); #define SEC_TO_TICK(sec) ((sec) * hz) #define MSEC_TO_TICK(msec) (howmany((hrtime_t)(msec) * hz, MILLISEC)) #define USEC_TO_TICK(usec) (howmany((hrtime_t)(usec) * hz, MICROSEC)) #define NSEC_TO_TICK(nsec) (howmany((hrtime_t)(nsec) * hz, NANOSEC)) #define max_ncpus 64 #define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) /* * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 #define maxclsyspri -20 #define defclsyspri 0 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID #define kcred NULL #define CRED() NULL #define ptob(x) ((x) * PAGESIZE) #define NN_DIVISOR_1000 (1U << 0) #define NN_NUMBUF_SZ (6) extern uint64_t physmem; extern const char *random_path; extern const char *urandom_path; extern int highbit64(uint64_t i); extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); static __inline__ uint32_t random_in_range(uint32_t range) { uint32_t r; ASSERT(range != 0); if (range == 1) return (0); (void) random_get_pseudo_bytes((uint8_t *)&r, sizeof (r)); return (r % range); } extern void kernel_init(int mode); extern void kernel_fini(void); extern void random_init(void); extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); extern int set_global_var(char const *arg); typedef struct callb_cpr { kmutex_t *cc_lockp; } callb_cpr_t; #define CALLB_CPR_INIT(cp, lockp, func, name) { \ (cp)->cc_lockp = lockp; \ } #define CALLB_CPR_SAFE_BEGIN(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_SAFE_END(cp, lockp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_EXIT(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ mutex_exit((cp)->cc_lockp); \ } #define zone_dataset_visible(x, y) (1) #define INGLOBALZONE(z) (1) extern uint32_t zone_get_hostid(void *zonep); extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); #define kmem_strfree(str) kmem_free((str), strlen(str) + 1) #define kmem_strdup(s) strdup(s) #ifndef __cplusplus extern int kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...); #endif /* * Hostname information */ extern int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result); typedef struct utsname utsname_t; extern utsname_t *utsname(void); /* ZFS Boot Related stuff. */ struct _buf { intptr_t _fd; }; struct bootstat { uint64_t st_size; }; typedef struct ace_object { uid_t a_who; uint32_t a_access_mask; uint16_t a_flags; uint16_t a_type; uint8_t a_obj_type[16]; uint8_t a_inherit_obj_type[16]; } ace_object_t; #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 #define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int secpolicy_zfs(const cred_t *cr); extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); extern zoneid_t getzoneid(void); /* SID stuff */ typedef struct ksiddomain { uint_t kd_ref; uint_t kd_len; char *kd_name; } ksiddomain_t; ksiddomain_t *ksid_lookupdomain(const char *); void ksiddomain_rele(ksiddomain_t *); #define DDI_SLEEP KM_SLEEP #define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) #define zfs_sleep_until(wakeup) \ do { \ hrtime_t delta = wakeup - gethrtime(); \ struct timespec ts; \ ts.tv_sec = delta / NANOSEC; \ ts.tv_nsec = delta % NANOSEC; \ (void) nanosleep(&ts, NULL); \ } while (0) typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); /* * Kernel modules */ #define __init #define __exit #endif /* _KERNEL || _STANDALONE */ #ifdef __cplusplus }; #endif #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index 8658d39e28fc..2d612a16b227 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -1,9423 +1,9427 @@ + + + + diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c index b94abea3d581..fdd1975fa677 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c @@ -1,2073 +1,2266 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation * * Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * Internal utility routines for the ZFS library. */ #include #include #include #include #include #include #include #include #include #if LIBFETCH_DYNAMIC #include #endif #include #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfs_prop.h" #include "zfeature_common.h" #include #include /* * We only care about the scheme in order to match the scheme * with the handler. Each handler should validate the full URI * as necessary. */ #define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" int libzfs_errno(libzfs_handle_t *hdl) { return (hdl->libzfs_error); } const char * libzfs_error_action(libzfs_handle_t *hdl) { return (hdl->libzfs_action); } const char * libzfs_error_description(libzfs_handle_t *hdl) { if (hdl->libzfs_desc[0] != '\0') return (hdl->libzfs_desc); switch (hdl->libzfs_error) { case EZFS_NOMEM: return (dgettext(TEXT_DOMAIN, "out of memory")); case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); case EZFS_PROPNONINHERIT: return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); case EZFS_PROPSPACE: return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); case EZFS_BADTYPE: return (dgettext(TEXT_DOMAIN, "operation not applicable to " "datasets of this type")); case EZFS_BUSY: return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); case EZFS_EXISTS: return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); case EZFS_NOENT: return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: return (dgettext(TEXT_DOMAIN, "unable to restore to " "destination")); case EZFS_BADBACKUP: return (dgettext(TEXT_DOMAIN, "backup failed")); case EZFS_BADTARGET: return (dgettext(TEXT_DOMAIN, "invalid target vdev")); case EZFS_NODEVICE: return (dgettext(TEXT_DOMAIN, "no such device in pool")); case EZFS_BADDEV: return (dgettext(TEXT_DOMAIN, "invalid device")); case EZFS_NOREPLICAS: return (dgettext(TEXT_DOMAIN, "no valid replicas")); case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: return (dgettext(TEXT_DOMAIN, "unsupported version or " "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); case EZFS_BADPATH: return (dgettext(TEXT_DOMAIN, "must be an absolute path")); case EZFS_CROSSTARGET: return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " "pools")); case EZFS_ZONED: return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: return (dgettext(TEXT_DOMAIN, "unmount failed")); case EZFS_UNSHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share removal failed")); case EZFS_SHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share creation failed")); case EZFS_UNSHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share removal failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share creation failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); case EZFS_FAULT: return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); case EZFS_CKSUM: return (dgettext(TEXT_DOMAIN, "insufficient replicas")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " "spare")); case EZFS_INVALCONFIG: return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); case EZFS_RECURSIVE: return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); case EZFS_POOL_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of pool")); case EZFS_POOL_INVALARG: return (dgettext(TEXT_DOMAIN, "invalid argument for " "this pool operation")); case EZFS_NAMETOOLONG: return (dgettext(TEXT_DOMAIN, "dataset name is too long")); case EZFS_OPENFAILED: return (dgettext(TEXT_DOMAIN, "open failed")); case EZFS_NOCAP: return (dgettext(TEXT_DOMAIN, "disk capacity information could not be retrieved")); case EZFS_LABELFAILED: return (dgettext(TEXT_DOMAIN, "write of label failed")); case EZFS_BADWHO: return (dgettext(TEXT_DOMAIN, "invalid user/group")); case EZFS_BADPERM: return (dgettext(TEXT_DOMAIN, "invalid permission")); case EZFS_BADPERMSET: return (dgettext(TEXT_DOMAIN, "invalid permission set name")); case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_VDEVNOTSUP: return (dgettext(TEXT_DOMAIN, "vdev specification is not " "supported")); case EZFS_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this dataset")); case EZFS_IOC_NOTSUPPORTED: return (dgettext(TEXT_DOMAIN, "operation not supported by " "zfs kernel module")); case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); case EZFS_REFTAG_RELE: return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); case EZFS_REFTAG_HOLD: return (dgettext(TEXT_DOMAIN, "tag already exists on this " "dataset")); case EZFS_TAGTOOLONG: return (dgettext(TEXT_DOMAIN, "tag too long")); case EZFS_PIPEFAILED: return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume scrub")); case EZFS_SCRUB_PAUSED_TO_CANCEL: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume or 'zpool scrub -s' to " "cancel scrub")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " "use 'zpool scrub -s' to cancel scrub")); case EZFS_ERRORSCRUBBING: return (dgettext(TEXT_DOMAIN, "currently error scrubbing; " "use 'zpool scrub -s' to cancel error scrub")); case EZFS_ERRORSCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "error scrub is paused; " "use 'zpool scrub -e' to resume error scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); case EZFS_NO_PENDING: return (dgettext(TEXT_DOMAIN, "operation is not " "in progress")); case EZFS_CHECKPOINT_EXISTS: return (dgettext(TEXT_DOMAIN, "checkpoint exists")); case EZFS_DISCARDING_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "currently discarding " "checkpoint")); case EZFS_NO_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); case EZFS_DEVRM_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "device removal in progress")); case EZFS_VDEV_TOO_BIG: return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); case EZFS_ACTIVE_POOL: return (dgettext(TEXT_DOMAIN, "pool is imported on a " "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); case EZFS_TOOMANY: return (dgettext(TEXT_DOMAIN, "argument list too long")); case EZFS_INITIALIZING: return (dgettext(TEXT_DOMAIN, "currently initializing")); case EZFS_NO_INITIALIZE: return (dgettext(TEXT_DOMAIN, "there is no active " "initialization")); case EZFS_WRONG_PARENT: return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); case EZFS_TRIMMING: return (dgettext(TEXT_DOMAIN, "currently trimming")); case EZFS_NO_TRIM: return (dgettext(TEXT_DOMAIN, "there is no active trim")); case EZFS_TRIM_NOTSUP: return (dgettext(TEXT_DOMAIN, "trim operations are not " "supported by this device")); case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); case EZFS_REBUILDING: return (dgettext(TEXT_DOMAIN, "currently sequentially " "resilvering")); case EZFS_VDEV_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of vdev")); case EZFS_NOT_USER_NAMESPACE: return (dgettext(TEXT_DOMAIN, "the provided file " "was not a user namespace file")); case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: assert(hdl->libzfs_error == 0); return (dgettext(TEXT_DOMAIN, "no error")); } } void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), fmt, ap); hdl->libzfs_desc_active = 1; va_end(ap); } static void zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), fmt, ap); hdl->libzfs_error = error; if (hdl->libzfs_desc_active) hdl->libzfs_desc_active = 0; else hdl->libzfs_desc[0] = '\0'; if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " "error: %s: %s\n"), hdl->libzfs_action, libzfs_error_description(hdl)); abort(); } (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, libzfs_error_description(hdl)); if (error == EZFS_NOMEM) exit(1); } } int zfs_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_error_fmt(hdl, error, "%s", msg)); } int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zfs_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { switch (error) { case EPERM: case EACCES: zfs_verror(hdl, EZFS_PERM, fmt, ap); return (-1); case ECANCELED: zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); return (-1); case EIO: zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); case EFAULT: zfs_verror(hdl, EZFS_FAULT, fmt, ap); return (-1); case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); case ECKSUM: zfs_verror(hdl, EZFS_CKSUM, fmt, ap); return (-1); } return (0); } int zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENXIO: case ENODEV: case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset does not exist")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_WRONG_PARENT: zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_NOT_USER_NAMESPACE: zfs_verror(hdl, EZFS_NOT_USER_NAMESPACE, fmt, ap); break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } va_end(ap); return (-1); } void zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, char *errbuf) { switch (err) { case ENOSPC: /* * For quotas and reservations, ENOSPC indicates * something different; setting a quota or reservation * doesn't use any disk space. */ switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is less than current used or " "reserved space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is greater than available space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; default: (void) zfs_standard_error(hdl, err, errbuf); break; } break; case EBUSY: (void) zfs_standard_error(hdl, EBUSY, errbuf); break; case EROFS: (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); break; case E2BIG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value too long")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool and or dataset must be upgraded to set this " "property or value")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case ERANGE: if (prop == ZFS_PROP_COMPRESSION || prop == ZFS_PROP_DNODESIZE || prop == ZFS_PROP_RECORDSIZE) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "bootable datasets")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else if (prop == ZFS_PROP_CHECKSUM || prop == ZFS_PROP_DEDUP) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "root pools")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EINVAL: if (prop == ZPROP_INVAL) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case ZFS_ERR_BADPROP: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case EACCES: if (prop == ZFS_PROP_KEYLOCATION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation may only be set on encryption roots")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EOVERFLOW: /* * This platform can't address a volume this big. */ #ifdef _ILP32 if (prop == ZFS_PROP_VOLSIZE) { (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; } zfs_fallthrough; #endif default: (void) zfs_standard_error(hdl, err, errbuf); } } int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENODEV: zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool or dataset")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; /* There is no pending operation to cancel */ case ENOTACTIVE: zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); break; case ENXIO: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is currently unavailable")); zfs_verror(hdl, EZFS_BADDEV, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); break; case EINVAL: zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "block size out of range or does not match")); zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_CHECKPOINT_EXISTS: zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); break; case ZFS_ERR_DISCARDING_CHECKPOINT: zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); break; case ZFS_ERR_NO_CHECKPOINT: zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); break; case ZFS_ERR_DEVRM_IN_PROGRESS: zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_RESILVER_IN_PROGRESS: zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); break; case ZFS_ERR_REBUILD_IN_PROGRESS: zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_VDEV_NOTSUP: zfs_verror(hdl, EZFS_VDEV_NOTSUP, fmt, ap); break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } va_end(ap); return (-1); } /* * Display an out of memory error message and abort the current program. */ int no_memory(libzfs_handle_t *hdl) { return (zfs_error(hdl, EZFS_NOMEM, "internal error")); } /* * A safe form of malloc() which will die if the allocation fails. */ void * zfs_alloc(libzfs_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) no_memory(hdl); return (data); } /* * A safe form of asprintf() which will die if the allocation fails. */ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; char *ret; int err; va_start(ap, fmt); err = vasprintf(&ret, fmt, ap); va_end(ap); if (err < 0) { (void) no_memory(hdl); ret = NULL; } return (ret); } /* * A safe form of realloc(), which also zeroes newly allocated space. */ void * zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) { void *ret; if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); return (NULL); } memset((char *)ret + oldsize, 0, newsize - oldsize); return (ret); } /* * A safe form of strdup() which will die if the allocation fails. */ char * zfs_strdup(libzfs_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) no_memory(hdl); return (ret); } void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { hdl->libzfs_printerr = printerr; } /* * Read lines from an open file descriptor and store them in an array of * strings until EOF. lines[] will be allocated and populated with all the * lines read. All newlines are replaced with NULL terminators for * convenience. lines[] must be freed after use with libzfs_free_str_array(). * * Returns the number of lines read. */ static int libzfs_read_stdout_from_fd(int fd, char **lines[]) { FILE *fp; int lines_cnt = 0; size_t len = 0; char *line = NULL; char **tmp_lines = NULL, **tmp; fp = fdopen(fd, "r"); if (fp == NULL) { close(fd); return (0); } while (getline(&line, &len, fp) != -1) { tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); if (tmp == NULL) { /* Return the lines we were able to process */ break; } tmp_lines = tmp; /* Remove newline if not EOF */ if (line[strlen(line) - 1] == '\n') line[strlen(line) - 1] = '\0'; tmp_lines[lines_cnt] = strdup(line); if (tmp_lines[lines_cnt] == NULL) break; ++lines_cnt; } free(line); fclose(fp); *lines = tmp_lines; return (lines_cnt); } static int libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, char **lines[], int *lines_cnt) { pid_t pid; int error, devnull_fd; int link[2]; /* * Setup a pipe between our child and parent process if we're * reading stdout. */ if (lines != NULL && pipe2(link, O_NONBLOCK | O_CLOEXEC) == -1) return (-EPIPE); pid = fork(); if (pid == 0) { /* Child process */ devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC); if (devnull_fd < 0) _exit(-1); if (!(flags & STDOUT_VERBOSE) && (lines == NULL)) (void) dup2(devnull_fd, STDOUT_FILENO); else if (lines != NULL) { /* Save the output to lines[] */ dup2(link[1], STDOUT_FILENO); } if (!(flags & STDERR_VERBOSE)) (void) dup2(devnull_fd, STDERR_FILENO); if (flags & NO_DEFAULT_PATH) { if (env == NULL) execv(path, argv); else execve(path, argv, env); } else { if (env == NULL) execvp(path, argv); else execvpe(path, argv, env); } _exit(-1); } else if (pid > 0) { /* Parent process */ int status; while ((error = waitpid(pid, &status, 0)) == -1 && errno == EINTR) ; if (error < 0 || !WIFEXITED(status)) return (-1); if (lines != NULL) { close(link[1]); *lines_cnt = libzfs_read_stdout_from_fd(link[0], lines); } return (WEXITSTATUS(status)); } return (-1); } int libzfs_run_process(const char *path, char *argv[], int flags) { return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL)); } /* * Run a command and store its stdout lines in an array of strings (lines[]). * lines[] is allocated and populated for you, and the number of lines is set in * lines_cnt. lines[] must be freed after use with libzfs_free_str_array(). * All newlines (\n) in lines[] are terminated for convenience. */ int libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt)); } /* * Same as libzfs_run_process_get_stdout(), but run without $PATH set. This * means that *path needs to be the full path to the executable. */ int libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH, lines, lines_cnt)); } /* * Free an array of strings. Free both the strings contained in the array and * the array itself. */ void libzfs_free_str_array(char **strs, int count) { while (--count >= 0) free(strs[count]); free(strs); } /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. * * Returns 0 otherwise. */ boolean_t libzfs_envvar_is_set(const char *envvar) { char *env = getenv(envvar); return (env && (strtoul(env, NULL, 0) > 0 || (!strncasecmp(env, "YES", 3) && strnlen(env, 4) == 3) || (!strncasecmp(env, "ON", 2) && strnlen(env, 3) == 2))); } libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; int error; char *env; if ((error = libzfs_load_module()) != 0) { errno = error; return (NULL); } if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { return (NULL); } if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { free(hdl); return (NULL); } if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL|O_CLOEXEC)) < 0) { free(hdl); return (NULL); } if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); vdev_prop_init(); libzfs_mnttab_init(hdl); fletcher_4_init(); if (getenv("ZFS_PROP_DEBUG") != NULL) { hdl->libzfs_prop_debug = B_TRUE; } if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) { if ((error = zfs_nicestrtonum(hdl, env, &hdl->libzfs_max_nvlist))) { errno = error; (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } } else { hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4); } /* * For testing, remove some settable properties and features */ if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) { zprop_desc_t *proptbl; proptbl = zpool_prop_get_table(); proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE; proptbl = zfs_prop_get_table(); proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE; zfeature_info_t *ftbl = spa_feature_table; ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } return (hdl); } void libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); zpool_free_handles(hdl); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); regfree(&hdl->libzfs_urire); fletcher_4_fini(); #if LIBFETCH_DYNAMIC if (hdl->libfetch != (void *)-1 && hdl->libfetch != NULL) (void) dlclose(hdl->libfetch); free(hdl->libfetch_load_error); #endif free(hdl); } libzfs_handle_t * zpool_get_handle(zpool_handle_t *zhp) { return (zhp->zpool_hdl); } libzfs_handle_t * zfs_get_handle(zfs_handle_t *zhp) { return (zhp->zfs_hdl); } zpool_handle_t * zfs_get_pool_handle(const zfs_handle_t *zhp) { return (zhp->zpool_hdl); } /* * Given a name, determine whether or not it's a valid path * (starts with '/' or "./"). If so, walk the mnttab trying * to match the device number. If not, treat the path as an * fs/vol/snap/bkmark name. */ zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* * It's not a valid path, assume it's a name of type 'argtype'. */ return (zfs_open(hdl, path, argtype)); } if (getextmntent(path, &entry, &statbuf) != 0) return (NULL); if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ void zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was * filled in by the kernel to indicate the actual required size. */ void zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called to free the src and dst nvlists stored in the command structure. */ void zcmd_free_nvlists(zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_conf = 0; zc->zc_nvlist_src = 0; zc->zc_nvlist_dst = 0; } static void zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, nvlist_t *nvl) { char *packed; size_t len = fnvlist_size(nvl); packed = zfs_alloc(hdl, len); verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); *outnv = (uint64_t)(uintptr_t)packed; *outlen = len; } void zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, &zc->zc_nvlist_conf_size, nvl); } void zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, &zc->zc_nvlist_src_size, nvl); } /* * Unpacks an nvlist from the ZFS ioctl command structure. */ int zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) { if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, nvlp, 0) != 0) return (no_memory(hdl)); return (0); } /* * ================================================================ * API shared by zfs and zpool property management * ================================================================ */ static void zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) { zprop_list_t *pl; int i; char *title; size_t len; cbp->cb_first = B_FALSE; if (cbp->cb_scripted) return; /* * Start with the length of the column headers. */ cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); /* first property is always NAME */ assert(cbp->cb_proplist->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME))); /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * 'PROPERTY' column */ if (pl->pl_prop != ZPROP_USERPROP) { const char *propname = (type == ZFS_TYPE_POOL) ? zpool_prop_to_name(pl->pl_prop) : ((type == ZFS_TYPE_VDEV) ? vdev_prop_to_name(pl->pl_prop) : zfs_prop_to_name(pl->pl_prop)); assert(propname != NULL); len = strlen(propname); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } else { assert(pl->pl_user_prop != NULL); len = strlen(pl->pl_user_prop); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } /* * 'VALUE' column. The first property is always the 'name' * property that was tacked on either by /sbin/zfs's * zfs_do_get() or when calling zprop_expand_list(), so we * ignore its width. If the user specified the name property * to display, then it will be later in the list in any case. */ if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; /* 'RECEIVED' column. */ if (pl != cbp->cb_proplist && pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; /* * 'NAME' and 'SOURCE' columns */ if (pl->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)) && pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + strlen(dgettext(TEXT_DOMAIN, "inherited from")); } } /* * Now go through and print the headers. */ for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); break; case GET_COL_PROPERTY: title = dgettext(TEXT_DOMAIN, "PROPERTY"); break; case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; case GET_COL_RECVD: title = dgettext(TEXT_DOMAIN, "RECEIVED"); break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; default: title = NULL; } if (title != NULL) { if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], title); } } (void) printf("\n"); } /* * Display a single line of output, according to the settings in the callback * structure. */ void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, const char *source, const char *recvd_value) { int i; const char *str = NULL; char buf[128]; /* * Ignore those source types that the user has chosen to ignore. */ if ((sourcetype & cbp->cb_sources) == 0) return; if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; break; case GET_COL_PROPERTY: str = propname; break; case GET_COL_VALUE: str = value; break; case GET_COL_SOURCE: switch (sourcetype) { case ZPROP_SRC_NONE: str = "-"; break; case ZPROP_SRC_DEFAULT: str = "default"; break; case ZPROP_SRC_LOCAL: str = "local"; break; case ZPROP_SRC_TEMPORARY: str = "temporary"; break; case ZPROP_SRC_INHERITED: (void) snprintf(buf, sizeof (buf), "inherited from %s", source); str = buf; break; case ZPROP_SRC_RECEIVED: str = "received"; break; default: str = NULL; assert(!"unhandled zprop_source_t"); } break; case GET_COL_RECVD: str = (recvd_value == NULL ? "-" : recvd_value); break; default: continue; } if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); } (void) printf("\n"); } /* * Given a numeric suffix, convert the value into a number of bits that the * resulting value must be shifted. */ static int str2shift(libzfs_handle_t *hdl, const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Allow 'G' = 'GB' = 'GiB', case-insensitively. * However, 'BB' and 'BiB' are disallowed. */ if (buf[1] == '\0' || (toupper(buf[0]) != 'B' && ((toupper(buf[1]) == 'B' && buf[2] == '\0') || (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && buf[3] == '\0')))) return (10 * i); if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Convert a string of the form '100G' into a real number. Used when setting * properties or creating a volume. 'buf' is used to place an extended error * message for the caller to use. */ int zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) { char *end; int shift; *num = 0; /* Check to see if this looks like a number. */ if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad numeric value '%s'"), value); return (-1); } /* Rely on strtoull() to process the numeric portion. */ errno = 0; *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit * in a 64-bit value. */ if (errno == ERANGE) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } /* * If we have a decimal value, then do the computation with floating * point arithmetic. Otherwise, use standard arithmetic. */ if (*end == '.') { double fval = strtod(value, &end); if ((shift = str2shift(hdl, end)) == -1) return (-1); fval *= pow(2, shift); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. */ if (fval >= (double)UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num = (uint64_t)fval; } else { if ((shift = str2shift(hdl, end)) == -1) return (-1); /* Check for overflow */ if (shift >= 64 || (*num << shift) >> shift != *num) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num <<= shift; } return (0); } /* * Given a propname=value nvpair to set, parse any numeric properties * (index, boolean, etc) if they are specified as strings and add the * resulting nvpair to the returned nvlist. * * At the DSL layer, all properties are either 64-bit numbers or strings. * We want the user to be able to ignore this fact and specify properties * as native values (numbers, for example) or as strings (to simplify * command line utilities). This also handles converting index types * (compression, checksum, etc) from strings to their on-disk index. */ int zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, zfs_type_t type, nvlist_t *ret, const char **svalp, uint64_t *ivalp, const char *errbuf) { data_type_t datatype = nvpair_type(elem); zprop_type_t proptype; const char *propname; const char *value; boolean_t isnone = B_FALSE; boolean_t isauto = B_FALSE; int err = 0; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); propname = zpool_prop_to_name(prop); } else if (type == ZFS_TYPE_VDEV) { proptype = vdev_prop_get_type(prop); propname = vdev_prop_to_name(prop); } else { proptype = zfs_prop_get_type(prop); propname = zfs_prop_to_name(prop); } /* * Convert any properties to the internal DSL value types. */ *svalp = NULL; *ivalp = 0; switch (proptype) { case PROP_TYPE_STRING: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } err = nvpair_value_string(elem, svalp); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is invalid"), nvpair_name(elem)); goto error; } if (strlen(*svalp) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is too long"), nvpair_name(elem)); goto error; } break; case PROP_TYPE_NUMBER: if (datatype == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &value); if (strcmp(value, "none") == 0) { isnone = B_TRUE; } else if (strcmp(value, "auto") == 0) { isauto = B_TRUE; } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { goto error; } } else if (datatype == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, ivalp); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), nvpair_name(elem)); goto error; } /* * Quota special: force 'none' and don't allow 0. */ if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable quota/refquota")); goto error; } /* * Special handling for "*_limit=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_DATASET) && isnone && (prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT)) { *ivalp = UINT64_MAX; } /* * Special handling for "checksum_*=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_VDEV) && isnone && (prop == VDEV_PROP_CHECKSUM_N || prop == VDEV_PROP_CHECKSUM_T || prop == VDEV_PROP_IO_N || prop == VDEV_PROP_IO_T)) { *ivalp = UINT64_MAX; } /* * Special handling for setting 'refreservation' to 'auto'. Use * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). * 'auto' is only allowed on volumes. */ if (isauto) { switch (prop) { case ZFS_PROP_REFRESERVATION: if ((type & ZFS_TYPE_VOLUME) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s=auto' only allowed on " "volumes"), nvpair_name(elem)); goto error; } *ivalp = UINT64_MAX; break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'auto' is invalid value for '%s'"), nvpair_name(elem)); goto error; } } break; case PROP_TYPE_INDEX: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, &value); if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname, zprop_values(prop, type)); goto error; } break; default: abort(); } /* * Add the result to our return set of properties. */ if (*svalp != NULL) { if (nvlist_add_string(ret, propname, *svalp) != 0) { (void) no_memory(hdl); return (-1); } } else { if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { (void) no_memory(hdl); return (-1); } } return (0); error: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); return (-1); } static int addlist(libzfs_handle_t *hdl, const char *propname, zprop_list_t **listp, zfs_type_t type) { int prop = zprop_name_to_prop(propname, type); if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE)) prop = ZPROP_INVAL; /* * Return failure if no property table entry was found and this isn't * a user-defined property. */ if (prop == ZPROP_USERPROP && ((type == ZFS_TYPE_POOL && !zfs_prop_user(propname) && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || ((type == ZFS_TYPE_DATASET) && !zfs_prop_user(propname) && !zfs_prop_userquota(propname) && !zfs_prop_written(propname)) || ((type == ZFS_TYPE_VDEV) && !vdev_prop_user(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } zprop_list_t *entry = zfs_alloc(hdl, sizeof (*entry)); entry->pl_prop = prop; if (prop == ZPROP_USERPROP) { entry->pl_user_prop = zfs_strdup(hdl, propname); entry->pl_width = strlen(propname); } else { entry->pl_width = zprop_width(prop, &entry->pl_fixed, type); } *listp = entry; return (0); } /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded * by zprop_expand_list(). */ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { *listp = NULL; /* * If 'all' is specified, return a NULL list. */ if (strcmp(props, "all") == 0) return (0); /* * If no props were specified, return an error. */ if (props[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no properties specified")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } for (char *p; (p = strsep(&props, ",")); ) if (strcmp(p, "space") == 0) { static const char *const spaceprops[] = { "name", "avail", "used", "usedbysnapshots", "usedbydataset", "usedbyrefreservation", "usedbychildren" }; for (int i = 0; i < ARRAY_SIZE(spaceprops); i++) { if (addlist(hdl, spaceprops[i], listp, type)) return (-1); listp = &(*listp)->pl_next; } } else { if (addlist(hdl, p, listp, type)) return (-1); listp = &(*listp)->pl_next; } return (0); } void zprop_free_list(zprop_list_t *pl) { zprop_list_t *next; while (pl != NULL) { next = pl->pl_next; free(pl->pl_user_prop); free(pl); pl = next; } } typedef struct expand_data { zprop_list_t **last; libzfs_handle_t *hdl; zfs_type_t type; } expand_data_t; static int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; expand_data_t *edp = cb; entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); entry->pl_all = B_TRUE; *(edp->last) = entry; edp->last = &entry->pl_next; return (ZPROP_CONT); } int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) { zprop_list_t *entry; zprop_list_t **last; expand_data_t exp; if (*plp == NULL) { /* * If this is the very first time we've been called for an 'all' * specification, expand the list to include all native * properties. */ last = plp; exp.last = last; exp.hdl = hdl; exp.type = type; if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, B_FALSE, type) == ZPROP_INVAL) return (-1); /* * Add 'name' to the beginning of the list, which is handled * specially. */ entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)); entry->pl_width = zprop_width(entry->pl_prop, &entry->pl_fixed, type); entry->pl_all = B_TRUE; entry->pl_next = *plp; *plp = entry; } return (0); } int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { return (zprop_iter_common(func, cb, show_all, ordered, type)); } const char * zfs_version_userland(void) { return (ZFS_META_ALIAS); } /* * Prints both zfs userland and kernel versions * Returns 0 on success, and -1 on error */ int zfs_version_print(void) { (void) puts(ZFS_META_ALIAS); char *kver = zfs_version_kernel(); if (kver == NULL) { fprintf(stderr, "zfs_version_kernel() failed: %s\n", strerror(errno)); return (-1); } (void) printf("zfs-kmod-%s\n", kver); free(kver); return (0); } /* * Return 1 if the user requested ANSI color output, and our terminal supports * it. Return 0 for no color. */ int use_color(void) { static int use_color = -1; char *term; /* * Optimization: * * For each zpool invocation, we do a single check to see if we should * be using color or not, and cache that value for the lifetime of the * the zpool command. That makes it cheap to call use_color() when * we're printing with color. We assume that the settings are not going * to change during the invocation of a zpool command (the user isn't * going to change the ZFS_COLOR value while zpool is running, for * example). */ if (use_color != -1) { /* * We've already figured out if we should be using color or * not. Return the cached value. */ return (use_color); } term = getenv("TERM"); /* * The user sets the ZFS_COLOR env var set to enable zpool ANSI color * output. However if NO_COLOR is set (https://no-color.org/) then * don't use it. Also, don't use color if terminal doesn't support * it. */ if (libzfs_envvar_is_set("ZFS_COLOR") && !libzfs_envvar_is_set("NO_COLOR") && isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 && strcmp("unknown", term) != 0) { /* Color supported */ use_color = 1; } else { use_color = 0; } return (use_color); } /* * The functions color_start() and color_end() are used for when you want * to colorize a block of text. * * For example: * color_start(ANSI_RED) * printf("hello"); * printf("world"); * color_end(); */ void color_start(const char *color) { if (color && use_color()) { fputs(color, stdout); fflush(stdout); } } void color_end(void) { if (use_color()) { fputs(ANSI_RESET, stdout); fflush(stdout); } } /* * printf() with a color. If color is NULL, then do a normal printf. */ int printf_color(const char *color, const char *format, ...) { va_list aptr; int rc; if (color) color_start(color); va_start(aptr, format); rc = vprintf(format, aptr); va_end(aptr); if (color) color_end(); return (rc); } + +/* PATH + 5 env vars + a NULL entry = 7 */ +#define ZPOOL_VDEV_SCRIPT_ENV_COUNT 7 + +/* + * There's a few places where ZFS will call external scripts (like the script + * in zpool.d/ and `zfs_prepare_disk`). These scripts are called with a + * reduced $PATH, and some vdev specific environment vars set. This function + * will allocate an populate the environment variable array that is passed to + * these scripts. The user must free the arrays with zpool_vdev_free_env() when + * they are done. + * + * The following env vars will be set (but value could be blank): + * + * POOL_NAME + * VDEV_PATH + * VDEV_UPATH + * VDEV_ENC_SYSFS_PATH + * + * In addition, you can set an optional environment variable named 'opt_key' + * to 'opt_val' if you want. + * + * Returns allocated env[] array on success, NULL otherwise. + */ +char ** +zpool_vdev_script_alloc_env(const char *pool_name, + const char *vdev_path, const char *vdev_upath, + const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val) +{ + char **env = NULL; + int rc; + + env = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*env)); + if (!env) + return (NULL); + + env[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin"); + if (!env[0]) + goto error; + + /* Setup our custom environment variables */ + rc = asprintf(&env[1], "POOL_NAME=%s", pool_name ? pool_name : ""); + if (rc == -1) { + env[1] = NULL; + goto error; + } + + rc = asprintf(&env[2], "VDEV_PATH=%s", vdev_path ? vdev_path : ""); + if (rc == -1) { + env[2] = NULL; + goto error; + } + + rc = asprintf(&env[3], "VDEV_UPATH=%s", vdev_upath ? vdev_upath : ""); + if (rc == -1) { + env[3] = NULL; + goto error; + } + + rc = asprintf(&env[4], "VDEV_ENC_SYSFS_PATH=%s", + vdev_enc_sysfs_path ? vdev_enc_sysfs_path : ""); + if (rc == -1) { + env[4] = NULL; + goto error; + } + + if (opt_key != NULL) { + rc = asprintf(&env[5], "%s=%s", opt_key, + opt_val ? opt_val : ""); + if (rc == -1) { + env[5] = NULL; + goto error; + } + } + + return (env); + +error: + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); + + return (NULL); +} + +/* + * Free the env[] array that was allocated by zpool_vdev_script_alloc_env(). + */ +void +zpool_vdev_script_free_env(char **env) +{ + for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) + free(env[i]); + + free(env); +} + +/* + * Prepare a disk by (optionally) running a program before labeling the disk. + * This can be useful for installing disk firmware or doing some pre-flight + * checks on the disk before it becomes part of the pool. The program run is + * located at ZFSEXECDIR/zfs_prepare_disk + * (E.x: /usr/local/libexec/zfs/zfs_prepare_disk). + * + * Return 0 on success, non-zero on failure. + */ +int +zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, + const char *prepare_str, char **lines[], int *lines_cnt) +{ + const char *script_path = ZFSEXECDIR "/zfs_prepare_disk"; + const char *pool_name; + int rc = 0; + + /* Path to script and a NULL entry */ + char *argv[2] = {(char *)script_path}; + char **env = NULL; + const char *path = NULL, *enc_sysfs_path = NULL; + char *upath; + *lines_cnt = 0; + + if (access(script_path, X_OK) != 0) { + /* No script, nothing to do */ + return (0); + } + + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &path); + (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + + upath = zfs_get_underlying_path(path); + pool_name = zhp ? zpool_get_name(zhp) : NULL; + + env = zpool_vdev_script_alloc_env(pool_name, path, upath, + enc_sysfs_path, "VDEV_PREPARE", prepare_str); + + free(upath); + + if (env == NULL) { + return (ENOMEM); + } + + rc = libzfs_run_process_get_stdout(script_path, argv, env, lines, + lines_cnt); + + zpool_vdev_script_free_env(env); + + return (rc); +} + +/* + * Optionally run a script and then label a disk. The script can be used to + * prepare a disk for inclusion into the pool. For example, it might update + * the disk's firmware or check its health. + * + * The 'name' provided is the short name, stripped of any leading + * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for + * the vdev. prepare_str is a string that gets passed as the VDEV_PREPARE + * env variable to the script. + * + * The following env vars are passed to the script: + * + * POOL_NAME: The pool name (blank during zpool create) + * VDEV_PREPARE: Reason why the disk is being prepared for inclusion: + * "create", "add", "replace", or "autoreplace" + * VDEV_PATH: Path to the disk + * VDEV_UPATH: One of the 'underlying paths' to the disk. This is + * useful for DM devices. + * VDEV_ENC_SYSFS_PATH: Path to the disk's enclosure sysfs path, if available. + * + * Note, some of these values can be blank. + * + * Return 0 on success, non-zero otherwise. + */ +int +zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, + const char *name, nvlist_t *vdev_nv, const char *prepare_str, + char **lines[], int *lines_cnt) +{ + int rc; + char vdev_path[MAXPATHLEN]; + (void) snprintf(vdev_path, sizeof (vdev_path), "%s/%s", DISK_ROOT, + name); + + /* zhp will be NULL when creating a pool */ + rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt); + if (rc != 0) + return (rc); + + rc = zpool_label_disk(hdl, zhp, name); + return (rc); +} diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c index a9b9bf4c2ce5..ffad7fc02bc9 100644 --- a/sys/contrib/openzfs/lib/libzpool/kernel.c +++ b/sys/contrib/openzfs/lib/libzpool/kernel.c @@ -1,1463 +1,1472 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Emulation of kernel services in userland. */ uint64_t physmem; uint32_t hostid; struct utsname hw_utsname; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; /* this only exists to have its address taken */ struct proc p0; /* * ========================================================================= * threads * ========================================================================= * * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for * the expected stack depth while small enough to avoid exhausting address * space with high thread counts. */ #define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) #define TS_STACK_MAX (256 * 1024) struct zk_thread_wrapper { void (*func)(void *); void *arg; }; static void * zk_thread_wrapper(void *arg) { struct zk_thread_wrapper ztw; memcpy(&ztw, arg, sizeof (ztw)); free(arg); ztw.func(ztw.arg); return (NULL); } kthread_t * zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; char *stkstr; struct zk_thread_wrapper *ztw; int detachstate = PTHREAD_CREATE_DETACHED; VERIFY0(pthread_attr_init(&attr)); if (state & TS_JOINABLE) detachstate = PTHREAD_CREATE_JOINABLE; VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); /* * We allow the default stack size in user space to be specified by * setting the ZFS_STACK_SIZE environment variable. This allows us * the convenience of observing and debugging stack overruns in * user space. Explicitly specified stack sizes will be honored. * The usage of ZFS_STACK_SIZE is discussed further in the * ENVIRONMENT VARIABLES sections of the ztest(1) man page. */ if (stksize == 0) { stkstr = getenv("ZFS_STACK_SIZE"); if (stkstr == NULL) stksize = TS_STACK_MAX; else stksize = MAX(atoi(stkstr), TS_STACK_MIN); } VERIFY3S(stksize, >, 0); stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); /* * If this ever fails, it may be because the stack size is not a * multiple of system page size. */ VERIFY0(pthread_attr_setstacksize(&attr, stksize)); VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); VERIFY(ztw = malloc(sizeof (*ztw))); ztw->func = func; ztw->arg = arg; VERIFY0(pthread_create(&tid, &attr, zk_thread_wrapper, ztw)); VERIFY0(pthread_attr_destroy(&attr)); return ((void *)(uintptr_t)tid); } /* * ========================================================================= * kstats * ========================================================================= */ kstat_t * kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) { (void) module, (void) instance, (void) name, (void) class, (void) type, (void) ndata, (void) ks_flag; return (NULL); } void kstat_install(kstat_t *ksp) { (void) ksp; } void kstat_delete(kstat_t *ksp) { (void) ksp; } void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { (void) ksp, (void) headers, (void) data, (void) addr; } /* * ========================================================================= * mutexes * ========================================================================= */ void mutex_init(kmutex_t *mp, char *name, int type, void *cookie) { (void) name, (void) type, (void) cookie; VERIFY0(pthread_mutex_init(&mp->m_lock, NULL)); memset(&mp->m_owner, 0, sizeof (pthread_t)); } void mutex_destroy(kmutex_t *mp) { VERIFY0(pthread_mutex_destroy(&mp->m_lock)); } void mutex_enter(kmutex_t *mp) { VERIFY0(pthread_mutex_lock(&mp->m_lock)); mp->m_owner = pthread_self(); } +int +mutex_enter_check_return(kmutex_t *mp) +{ + int error = pthread_mutex_lock(&mp->m_lock); + if (error == 0) + mp->m_owner = pthread_self(); + return (error); +} + int mutex_tryenter(kmutex_t *mp) { int error = pthread_mutex_trylock(&mp->m_lock); if (error == 0) { mp->m_owner = pthread_self(); return (1); } else { VERIFY3S(error, ==, EBUSY); return (0); } } void mutex_exit(kmutex_t *mp) { memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } /* * ========================================================================= * rwlocks * ========================================================================= */ void rw_init(krwlock_t *rwlp, char *name, int type, void *arg) { (void) name, (void) type, (void) arg; VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL)); rwlp->rw_readers = 0; rwlp->rw_owner = 0; } void rw_destroy(krwlock_t *rwlp) { VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock)); } void rw_enter(krwlock_t *rwlp, krw_t rw) { if (rw == RW_READER) { VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock)); atomic_inc_uint(&rwlp->rw_readers); } else { VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock)); rwlp->rw_owner = pthread_self(); } } void rw_exit(krwlock_t *rwlp) { if (RW_READ_HELD(rwlp)) atomic_dec_uint(&rwlp->rw_readers); else rwlp->rw_owner = 0; VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock)); } int rw_tryenter(krwlock_t *rwlp, krw_t rw) { int error; if (rw == RW_READER) error = pthread_rwlock_tryrdlock(&rwlp->rw_lock); else error = pthread_rwlock_trywrlock(&rwlp->rw_lock); if (error == 0) { if (rw == RW_READER) atomic_inc_uint(&rwlp->rw_readers); else rwlp->rw_owner = pthread_self(); return (1); } VERIFY3S(error, ==, EBUSY); return (0); } uint32_t zone_get_hostid(void *zonep) { /* * We're emulating the system's hostid in userland. */ (void) zonep; return (hostid); } int rw_tryupgrade(krwlock_t *rwlp) { (void) rwlp; return (0); } /* * ========================================================================= * condition variables * ========================================================================= */ void cv_init(kcondvar_t *cv, char *name, int type, void *arg) { (void) name, (void) type, (void) arg; VERIFY0(pthread_cond_init(cv, NULL)); } void cv_destroy(kcondvar_t *cv) { VERIFY0(pthread_cond_destroy(cv)); } void cv_wait(kcondvar_t *cv, kmutex_t *mp) { memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_cond_wait(cv, &mp->m_lock)); mp->m_owner = pthread_self(); } int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) { cv_wait(cv, mp); return (1); } int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; struct timeval tv; struct timespec ts; clock_t delta; delta = abstime - ddi_get_lbolt(); if (delta <= 0) return (-1); VERIFY(gettimeofday(&tv, NULL) == 0); ts.tv_sec = tv.tv_sec + delta / hz; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } int cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { (void) res; int error; struct timeval tv; struct timespec ts; hrtime_t delta; ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); delta = tim; if (flag & CALLOUT_FLAG_ABSOLUTE) delta -= gethrtime(); if (delta <= 0) return (-1); VERIFY0(gettimeofday(&tv, NULL)); ts.tv_sec = tv.tv_sec + delta / NANOSEC; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } void cv_signal(kcondvar_t *cv) { VERIFY0(pthread_cond_signal(cv)); } void cv_broadcast(kcondvar_t *cv) { VERIFY0(pthread_cond_broadcast(cv)); } /* * ========================================================================= * procfs list * ========================================================================= */ void seq_printf(struct seq_file *m, const char *fmt, ...) { (void) m, (void) fmt; } void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { (void) module, (void) submodule, (void) name, (void) mode, (void) show, (void) show_header, (void) clear; mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); procfs_list->pl_next_id = 1; procfs_list->pl_node_offset = procfs_list_node_off; } void procfs_list_uninstall(procfs_list_t *procfs_list) { (void) procfs_list; } void procfs_list_destroy(procfs_list_t *procfs_list) { ASSERT(list_is_empty(&procfs_list->pl_list)); list_destroy(&procfs_list->pl_list); mutex_destroy(&procfs_list->pl_lock); } #define NODE_ID(procfs_list, obj) \ (((procfs_list_node_t *)(((char *)obj) + \ (procfs_list)->pl_node_offset))->pln_id) void procfs_list_add(procfs_list_t *procfs_list, void *p) { ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; list_insert_tail(&procfs_list->pl_list, p); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. * String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. */ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; if (dprintf_string != NULL) zfs_flags |= ZFS_DEBUG_DPRINTF; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { /* Get rid of annoying "../common/" prefix to filename. */ const char *newfile = zfs_basename(file); va_list adx; if (dprint) { /* dprintf messages are printed immediately */ if (!dprintf_print_all && !dprintf_find_string(newfile) && !dprintf_find_string(func)) return; /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%ju ", (uintmax_t)(uintptr_t)pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("dprintf: %s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } else { /* zfs_dbgmsg is logged for dumping later */ size_t size; char *buf; int i; size = 1024; buf = umem_alloc(size, UMEM_NOFAIL); i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); if (i < size) { va_start(adx, fmt); (void) vsnprintf(buf + i, size - i, fmt, adx); va_end(adx); } __zfs_dbgmsg(buf); umem_free(buf, size); } } /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; __attribute__((noreturn)) void vpanic(const char *fmt, va_list adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } __attribute__((noreturn)) void panic(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } void cmn_err(int ce, const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { (void) poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. * The __builtin_clzll() function is supported by both GCC and Clang. */ int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. * The __builtin_ffsll() function is supported by both GCC and Clang. */ int lowbit64(uint64_t i) { if (i == 0) return (0); return (__builtin_ffsll(i)); } const char *random_path = "/dev/random"; const char *urandom_path = "/dev/urandom"; static int random_fd = -1, urandom_fd = -1; void random_init(void) { VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1); VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1); } void random_fini(void) { close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { errno = 0; *result = strtoull(str, nptr, base); if (*result == 0) return (errno); return (0); } utsname_t * utsname(void) { return (&hw_utsname); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } void kernel_init(int mode) { extern uint_t rrw_tsd_key; umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", (u_longlong_t)physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); hostid = (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0; random_init(); VERIFY0(uname(&hw_utsname)); system_taskq_init(); icp_init(); zstd_init(); spa_init((spa_mode_t)mode); fletcher_4_init(); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); } void kernel_fini(void) { fletcher_4_fini(); spa_fini(); zstd_fini(); icp_fini(); system_taskq_fini(); random_fini(); } uid_t crgetuid(cred_t *cr) { (void) cr; return (0); } uid_t crgetruid(cred_t *cr) { (void) cr; return (0); } gid_t crgetgid(cred_t *cr) { (void) cr; return (0); } int crgetngroups(cred_t *cr) { (void) cr; return (0); } gid_t * crgetgroups(cred_t *cr) { (void) cr; return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { (void) name, (void) cr; return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { (void) from, (void) to, (void) cr; return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { (void) name, (void) cr; return (0); } int secpolicy_zfs(const cred_t *cr) { (void) cr; return (0); } int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) { (void) cr, (void) proc; return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *buf = NULL; va_list adx_copy; va_copy(adx_copy, adx); VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); va_end(adx_copy); return (buf); } char * kmem_asprintf(const char *fmt, ...) { char *buf = NULL; va_list adx; va_start(adx, fmt); VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* * kmem_scnprintf() will return the number of characters that it would have * printed whenever it is limited by value of the size variable, rather than * the number of characters that it did print. This can cause misbehavior on * subsequent uses of the return value, so we define a safe version that will * return the number of characters actually printed, minus the NULL format * character. Subsequent use of this by the safe string functions is safe * whether it is snprintf(), strlcat() or strlcpy(). */ int kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...) { int n; va_list ap; /* Make the 0 case a no-op so that we do not return -1 */ if (size == 0) return (0); va_start(ap, fmt); n = vsnprintf(str, size, fmt, ap); va_end(ap); if (n >= size) n = size - 1; return (n); } zfs_file_t * zfs_onexit_fd_hold(int fd, minor_t *minorp) { (void) fd; *minorp = 0; return (NULL); } void zfs_onexit_fd_rele(zfs_file_t *fp) { (void) fp; } int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uintptr_t *action_handle) { (void) minor, (void) func, (void) data, (void) action_handle; return (0); } fstrans_cookie_t spl_fstrans_mark(void) { return ((fstrans_cookie_t)0); } void spl_fstrans_unmark(fstrans_cookie_t cookie) { (void) cookie; } int __spl_pf_fstrans_check(void) { return (0); } int kmem_cache_reap_active(void) { return (0); } void zvol_create_minor(const char *name) { (void) name; } void zvol_create_minors_recursive(const char *name) { (void) name; } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { (void) spa, (void) name, (void) async; } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { (void) spa, (void) oldname, (void) newname, (void) async; } /* * Open file * * path - fully qualified path to file * flags - file attributes O_READ / O_WRITE / O_EXCL * fpp - pointer to return file pointer * * Returns 0 on success underlying error on failure. */ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { int fd = -1; int dump_fd = -1; int err; int old_umask = 0; zfs_file_t *fp; struct stat64 st; if (!(flags & O_CREAT) && stat64(path, &st) == -1) return (errno); if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) flags |= O_DIRECT; if (flags & O_CREAT) old_umask = umask(0); fd = open64(path, flags, mode); if (fd == -1) return (errno); if (flags & O_CREAT) (void) umask(old_umask); if (vn_dumpdir != NULL) { char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); const char *inpath = zfs_basename(path); (void) snprintf(dumppath, MAXPATHLEN, "%s/%s", vn_dumpdir, inpath); dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); umem_free(dumppath, MAXPATHLEN); if (dump_fd == -1) { err = errno; close(fd); return (err); } } else { dump_fd = -1; } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); fp->f_fd = fd; fp->f_dump_fd = dump_fd; *fpp = fp; return (0); } void zfs_file_close(zfs_file_t *fp) { close(fp->f_fd); if (fp->f_dump_fd != -1) close(fp->f_dump_fd); umem_free(fp, sizeof (zfs_file_t)); } /* * Stateful write - use os internal file pointer to determine where to * write and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { ssize_t rc; rc = write(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless write - os internal file pointer is not updated. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * off - file offset to write to (only valid for seekable types) * resid - pointer to count of unwritten bytes * * Returns 0 on success errno on failure. */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t pos, ssize_t *resid) { ssize_t rc, split, done; int sectors; /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ sectors = count >> SPA_MINBLOCKSHIFT; split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; rc = pwrite64(fp->f_fd, (char *)buf + split, count - split, pos + split); } #ifdef __linux__ if (rc == -1 && errno == EINVAL) { /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order * to catch the offender. */ abort(); } #endif if (rc < 0) return (errno); done += rc; if (resid) { *resid = count - done; } else if (done != count) { return (EIO); } return (0); } /* * Stateful read - use os internal file pointer to determine where to * read and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to read * resid - pointer to count of unread bytes (if short read) * * Returns 0 on success errno on failure. */ int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { int rc; rc = read(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless read - os internal file pointer is not updated. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to write * off - file offset to read from (only valid for seekable types) * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = pread64(fp->f_fd, buf, count, off); if (rc < 0) { #ifdef __linux__ /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order to * catch the offender. */ if (errno == EINVAL) abort(); #endif return (errno); } if (fp->f_dump_fd != -1) { int status; status = pwrite64(fp->f_dump_fd, buf, rc, off); ASSERT(status != -1); } if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * lseek - set / get file pointer * * fp - pointer to file (pipe, socket, etc) to read from * offp - value to seek to, returns current value plus passed offset * whence - see man pages for standard lseek whence values * * Returns 0 on success errno on failure (ESPIPE for non seekable types) */ int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; rc = lseek(fp->f_fd, *offp, whence); if (rc < 0) return (errno); *offp = rc; return (0); } /* * Get file attributes * * filp - file pointer * zfattr - pointer to file attr structure * * Currently only used for fetching size and file mode * * Returns 0 on success or error code of underlying getattr call on failure. */ int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct stat64 st; if (fstat64_blk(fp->f_fd, &st) == -1) return (errno); zfattr->zfa_size = st.st_size; zfattr->zfa_mode = st.st_mode; return (0); } /* * Sync file to disk * * filp - file pointer * flags - O_SYNC and or O_DSYNC * * Returns 0 on success or error code of underlying sync call on failure. */ int zfs_file_fsync(zfs_file_t *fp, int flags) { (void) flags; if (fsync(fp->f_fd) < 0) return (errno); return (0); } /* * fallocate - allocate or free space on disk * * fp - file pointer * mode (non-standard options for hole punching etc) * offset - offset to start allocating or freeing from * len - length to free / allocate * * OPTIONAL */ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) { #ifdef __linux__ return (fallocate(fp->f_fd, mode, offset, len)); #else (void) fp, (void) mode, (void) offset, (void) len; return (EOPNOTSUPP); #endif } /* * Request current file pointer offset * * fp - pointer to file * * Returns current file offset. */ loff_t zfs_file_off(zfs_file_t *fp) { return (lseek(fp->f_fd, SEEK_CUR, 0)); } /* * unlink file * * path - fully qualified file path * * Returns 0 on success. * * OPTIONAL */ int zfs_file_unlink(const char *path) { return (remove(path)); } /* * Get reference to file pointer * * fd - input file descriptor * * Returns pointer to file struct or NULL. * Unsupported in user space. */ zfs_file_t * zfs_file_get(int fd) { (void) fd; abort(); return (NULL); } /* * Drop reference to file pointer * * fp - pointer to file struct * * Unsupported in user space. */ void zfs_file_put(zfs_file_t *fp) { abort(); (void) fp; } void zfsvfs_update_fromname(const char *oldname, const char *newname) { (void) oldname, (void) newname; } void spa_import_os(spa_t *spa) { (void) spa; } void spa_export_os(spa_t *spa) { (void) spa; } void spa_activate_os(spa_t *spa) { (void) spa; } void spa_deactivate_os(spa_t *spa) { (void) spa; } diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c index 8b64369dc29f..44ed697dd490 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_import_os.c @@ -1,899 +1,898 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2015 RackTop Systems. * Copyright (c) 2016, Intel Corporation. */ /* * Pool import support functions. * * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since * these commands are expected to run in the global zone, we can assume * that the devices are all readable when called. * * To import a pool, we rely on reading the configuration information from the * ZFS label of each device. If we successfully read the label, then we * organize the configuration information in the following hierarchy: * * pool guid -> toplevel vdev guid -> label txg * * Duplicate entries matching this same tuple will be discarded. Once we have * examined every device, we pick the best label txg config for each toplevel * vdev. We then arrange these toplevel vdevs into a complete pool config, and * update any paths that have changed. Finally, we attempt to import the pool * using our derived config, and record the results. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zutil_import.h" #ifdef HAVE_LIBUDEV #include #include #endif #include #define DEV_BYID_PATH "/dev/disk/by-id/" /* * Skip devices with well known prefixes: * there can be side effects when opening devices which need to be avoided. * * hpet - High Precision Event Timer * watchdog[N] - Watchdog must be closed in a special way. */ static boolean_t should_skip_dev(const char *dev) { return ((strcmp(dev, "watchdog") == 0) || (strncmp(dev, "watchdog", 8) == 0 && isdigit(dev[8])) || (strcmp(dev, "hpet") == 0)); } int zfs_dev_flush(int fd) { return (ioctl(fd, BLKFLSBUF)); } void zpool_open_func(void *arg) { rdsk_node_t *rn = arg; libpc_handle_t *hdl = rn->rn_hdl; struct stat64 statbuf; nvlist_t *config; uint64_t vdev_guid = 0; int error; int num_labels = 0; int fd; if (should_skip_dev(zfs_basename(rn->rn_name))) return; /* * Ignore failed stats. We only want regular files and block devices. * Ignore files that are too small to hold a zpool. */ if (stat64(rn->rn_name, &statbuf) != 0 || (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)) || (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE)) return; /* * Preferentially open using O_DIRECT to bypass the block device * cache which may be stale for multipath devices. An EINVAL errno * indicates O_DIRECT is unsupported so fallback to just O_RDONLY. */ fd = open(rn->rn_name, O_RDONLY | O_DIRECT | O_CLOEXEC); if ((fd < 0) && (errno == EINVAL)) fd = open(rn->rn_name, O_RDONLY | O_CLOEXEC); if ((fd < 0) && (errno == EACCES)) hdl->lpc_open_access_error = B_TRUE; if (fd < 0) return; error = zpool_read_label(fd, &config, &num_labels); if (error != 0) { (void) close(fd); return; } if (num_labels == 0) { (void) close(fd); nvlist_free(config); return; } /* * Check that the vdev is for the expected guid. Additional entries * are speculatively added based on the paths stored in the labels. * Entries with valid paths but incorrect guids must be removed. */ error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { (void) close(fd); nvlist_free(config); return; } (void) close(fd); rn->rn_config = config; rn->rn_num_labels = num_labels; /* * Add additional entries for paths described by this label. */ if (rn->rn_labelpaths) { const char *path = NULL; const char *devid = NULL; const char *env = NULL; rdsk_node_t *slice; avl_index_t where; int timeout; int error; if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) return; env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || timeout < 0) { timeout = DISK_LABEL_WAIT; } /* * Allow devlinks to stabilize so all paths are available. */ zpool_label_disk_wait(rn->rn_name, timeout); if (path != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zutil_strdup(hdl, path); slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; slice->rn_order = IMPORT_ORDER_PREFERRED_1; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { pthread_mutex_unlock(rn->rn_lock); free(slice->rn_name); free(slice); } else { avl_insert(rn->rn_avl, slice, where); pthread_mutex_unlock(rn->rn_lock); zpool_open_func(slice); } } if (devid != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); error = asprintf(&slice->rn_name, "%s%s", DEV_BYID_PATH, devid); if (error == -1) { free(slice); return; } slice->rn_vdev_guid = vdev_guid; slice->rn_avl = rn->rn_avl; slice->rn_hdl = hdl; slice->rn_order = IMPORT_ORDER_PREFERRED_2; slice->rn_labelpaths = B_FALSE; pthread_mutex_lock(rn->rn_lock); if (avl_find(rn->rn_avl, slice, &where)) { pthread_mutex_unlock(rn->rn_lock); free(slice->rn_name); free(slice); } else { avl_insert(rn->rn_avl, slice, where); pthread_mutex_unlock(rn->rn_lock); zpool_open_func(slice); } } } } static const char * const zpool_default_import_path[] = { "/dev/disk/by-vdev", /* Custom rules, use first if they exist */ "/dev/mapper", /* Use multipath devices before components */ "/dev/disk/by-partlabel", /* Single unique entry set by user */ "/dev/disk/by-partuuid", /* Generated partition uuid */ "/dev/disk/by-label", /* Custom persistent labels */ "/dev/disk/by-uuid", /* Single unique entry and persistent */ "/dev/disk/by-id", /* May be multiple entries and persistent */ "/dev/disk/by-path", /* Encodes physical location and persistent */ "/dev" /* UNSAFE device names will change */ }; const char * const * zpool_default_search_paths(size_t *count) { *count = ARRAY_SIZE(zpool_default_import_path); return (zpool_default_import_path); } /* * Given a full path to a device determine if that device appears in the * import search path. If it does return the first match and store the * index in the passed 'order' variable, otherwise return an error. */ static int zfs_path_order(const char *name, int *order) { const char *env = getenv("ZPOOL_IMPORT_PATH"); if (env) { for (int i = 0; ; ++i) { env += strspn(env, ":"); size_t dirlen = strcspn(env, ":"); if (dirlen) { if (strncmp(name, env, dirlen) == 0) { *order = i; return (0); } env += dirlen; } else break; } } else { for (int i = 0; i < ARRAY_SIZE(zpool_default_import_path); ++i) { if (strncmp(name, zpool_default_import_path[i], strlen(zpool_default_import_path[i])) == 0) { *order = i; return (0); } } } return (ENOENT); } /* * Use libblkid to quickly enumerate all known zfs devices. */ int zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, avl_tree_t **slice_cache) { rdsk_node_t *slice; blkid_cache cache; blkid_dev_iterate iter; blkid_dev dev; avl_index_t where; int error; *slice_cache = NULL; error = blkid_get_cache(&cache, NULL); if (error != 0) return (error); error = blkid_probe_all_new(cache); if (error != 0) { blkid_put_cache(cache); return (error); } iter = blkid_dev_iterate_begin(cache); if (iter == NULL) { blkid_put_cache(cache); return (EINVAL); } /* Only const char *s since 2.32 */ error = blkid_dev_set_search(iter, (char *)"TYPE", (char *)"zfs_member"); if (error != 0) { blkid_dev_iterate_end(iter); blkid_put_cache(cache); return (error); } *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); while (blkid_dev_next(iter, &dev) == 0) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev)); slice->rn_vdev_guid = 0; slice->rn_lock = lock; slice->rn_avl = *slice_cache; slice->rn_hdl = hdl; slice->rn_labelpaths = B_TRUE; error = zfs_path_order(slice->rn_name, &slice->rn_order); if (error == 0) slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; else slice->rn_order = IMPORT_ORDER_DEFAULT; pthread_mutex_lock(lock); if (avl_find(*slice_cache, slice, &where)) { free(slice->rn_name); free(slice); } else { avl_insert(*slice_cache, slice, where); } pthread_mutex_unlock(lock); } blkid_dev_iterate_end(iter); blkid_put_cache(cache); return (0); } /* * Linux persistent device strings for vdev labels * * based on libudev for consistency with libudev disk add/remove events */ typedef struct vdev_dev_strs { char vds_devid[128]; char vds_devphys[128]; } vdev_dev_strs_t; #ifdef HAVE_LIBUDEV /* * Obtain the persistent device id string (describes what) * * used by ZED vdev matching for auto-{online,expand,replace} */ int zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { struct udev_list_entry *entry; const char *bus; char devbyid[MAXPATHLEN]; /* The bus based by-id path is preferred */ bus = udev_device_get_property_value(dev, "ID_BUS"); if (bus == NULL) { const char *dm_uuid; /* * For multipath nodes use the persistent uuid based identifier * * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f */ dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); if (dm_uuid != NULL) { (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); return (0); } /* * For volumes use the persistent /dev/zvol/dataset identifier */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { const char *name; name = udev_list_entry_get_name(entry); if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { (void) strlcpy(bufptr, name, buflen); return (0); } entry = udev_list_entry_get_next(entry); } /* * NVME 'by-id' symlinks are similar to bus case */ struct udev_device *parent; parent = udev_device_get_parent_with_subsystem_devtype(dev, "nvme", NULL); if (parent != NULL) bus = "nvme"; /* continue with bus symlink search */ else return (ENODATA); } /* * locate the bus specific by-id link */ (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { const char *name; name = udev_list_entry_get_name(entry); if (strncmp(name, devbyid, strlen(devbyid)) == 0) { name += strlen(DEV_BYID_PATH); (void) strlcpy(bufptr, name, buflen); return (0); } entry = udev_list_entry_get_next(entry); } return (ENODATA); } /* * Obtain the persistent physical location string (describes where) * * used by ZED vdev matching for auto-{online,expand,replace} */ int zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { const char *physpath = NULL; struct udev_list_entry *entry; /* * Normal disks use ID_PATH for their physical path. */ physpath = udev_device_get_property_value(dev, "ID_PATH"); if (physpath != NULL && strlen(physpath) > 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } /* * Device mapper devices are virtual and don't have a physical * path. For them we use ID_VDEV instead, which is setup via the * /etc/vdev_id.conf file. ID_VDEV provides a persistent path * to a virtual device. If you don't have vdev_id.conf setup, * you cannot use multipath autoreplace with device mapper. */ physpath = udev_device_get_property_value(dev, "ID_VDEV"); if (physpath != NULL && strlen(physpath) > 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } /* * For ZFS volumes use the persistent /dev/zvol/dataset identifier */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { physpath = udev_list_entry_get_name(entry); if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } entry = udev_list_entry_get_next(entry); } /* * For all other devices fallback to using the by-uuid name. */ entry = udev_device_get_devlinks_list_entry(dev); while (entry != NULL) { physpath = udev_list_entry_get_name(entry); if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { (void) strlcpy(bufptr, physpath, buflen); return (0); } entry = udev_list_entry_get_next(entry); } return (ENODATA); } /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" * DM_NAME key value has "mpath" prefix * DM_UUID key exists * ID_PART_TABLE_TYPE key does not exist or is not gpt */ static boolean_t udev_mpath_whole_disk(struct udev_device *dev) { const char *devname, *type, *uuid; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && (uuid != NULL)) { return (B_TRUE); } return (B_FALSE); } static int udev_device_is_ready(struct udev_device *dev) { #ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED return (udev_device_get_is_initialized(dev)); #else /* wait for DEVLINKS property to be initialized */ return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); #endif } #else int zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) { (void) dev, (void) bufptr, (void) buflen; return (ENODATA); } int zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { (void) dev, (void) bufptr, (void) buflen; return (ENODATA); } #endif /* HAVE_LIBUDEV */ /* * Wait up to timeout_ms for udev to set up the device node. The device is * considered ready when libudev determines it has been initialized, all of * the device links have been verified to exist, and it has been allowed to - * settle. At this point the device the device can be accessed reliably. - * Depending on the complexity of the udev rules this process could take - * several seconds. + * settle. At this point the device can be accessed reliably. Depending on + * the complexity of the udev rules this process could take several seconds. */ int zpool_label_disk_wait(const char *path, int timeout_ms) { #ifdef HAVE_LIBUDEV struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname = NULL; int ret = ENODEV; int settle_ms = 50; long sleep_ms = 10; hrtime_t start, settle; if ((udev = udev_new()) == NULL) return (ENXIO); start = gethrtime(); settle = 0; do { if (sysname == NULL) { if (realpath(path, nodepath) != NULL) { sysname = strrchr(nodepath, '/') + 1; } else { (void) usleep(sleep_ms * MILLISEC); continue; } } dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname); if ((dev != NULL) && udev_device_is_ready(dev)) { struct udev_list_entry *links, *link = NULL; ret = 0; links = udev_device_get_devlinks_list_entry(dev); udev_list_entry_foreach(link, links) { struct stat64 statbuf; const char *name; name = udev_list_entry_get_name(link); errno = 0; if (stat64(name, &statbuf) == 0 && errno == 0) continue; settle = 0; ret = ENODEV; break; } if (ret == 0) { if (settle == 0) { settle = gethrtime(); } else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) { udev_device_unref(dev); break; } } } udev_device_unref(dev); (void) usleep(sleep_ms * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); udev_unref(udev); return (ret); #else int settle_ms = 50; long sleep_ms = 10; hrtime_t start, settle; struct stat64 statbuf; start = gethrtime(); settle = 0; do { errno = 0; if ((stat64(path, &statbuf) == 0) && (errno == 0)) { if (settle == 0) settle = gethrtime(); else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) return (0); } else if (errno != ENOENT) { return (errno); } usleep(sleep_ms * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); return (ENODEV); #endif /* HAVE_LIBUDEV */ } /* * Encode the persistent devices strings * used for the vdev disk label */ static int encode_device_strings(const char *path, vdev_dev_strs_t *ds, boolean_t wholedisk) { #ifdef HAVE_LIBUDEV struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; int ret = ENODEV; hrtime_t start; if ((udev = udev_new()) == NULL) return (ENXIO); /* resolve path to a runtime device node instance */ if (realpath(path, nodepath) == NULL) goto no_dev; sysname = strrchr(nodepath, '/') + 1; /* * Wait up to 3 seconds for udev to set up the device node context */ start = gethrtime(); do { dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname); if (dev == NULL) goto no_dev; if (udev_device_is_ready(dev)) break; /* udev ready */ udev_device_unref(dev); dev = NULL; if (NSEC2MSEC(gethrtime() - start) < 10) (void) sched_yield(); /* yield/busy wait up to 10ms */ else (void) usleep(10 * MILLISEC); } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); if (dev == NULL) goto no_dev; /* * Only whole disks require extra device strings */ if (!wholedisk && !udev_mpath_whole_disk(dev)) goto no_dev; ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); if (ret != 0) goto no_dev_ref; /* physical location string (optional) */ if (zfs_device_get_physical(dev, ds->vds_devphys, sizeof (ds->vds_devphys)) != 0) { ds->vds_devphys[0] = '\0'; /* empty string --> not available */ } no_dev_ref: udev_device_unref(dev); no_dev: udev_unref(udev); return (ret); #else (void) path; (void) ds; (void) wholedisk; return (ENOENT); #endif } /* * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' */ static void update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) { char *upath, *spath; /* Add enclosure sysfs path (if disk is in an enclosure). */ upath = zfs_get_underlying_path(path); spath = zfs_get_enclosure_sysfs_path(upath); if (spath) { nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); } else { nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } free(upath); free(spath); } /* * This will get called for each leaf vdev. */ static int sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) { (void) hdl_data, (void) data; const char *path = NULL; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); /* Rescan our enclosure sysfs path for this vdev */ update_vdev_config_dev_sysfs_path(nv, path); return (0); } /* * Given an nvlist for our pool (with vdev tree), iterate over all the * leaf vdevs and update their ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH. */ void update_vdevs_config_dev_sysfs_path(nvlist_t *config) { nvlist_t *nvroot = NULL; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); for_each_vdev_in_nvlist(nvroot, sysfs_path_pool_vdev_iter_f, NULL); } /* * Update a leaf vdev's persistent device strings * * - only applies for a dedicated leaf vdev (aka whole disk) * - updated during pool create|add|attach|import * - used for matching device matching during auto-{online,expand,replace} * - stored in a leaf disk config label (i.e. alongside 'path' NVP) * - these strings are currently not used in kernel (i.e. for vdev_disk_open) * * single device node example: * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' * * multipath device node example: * devid: 'dm-uuid-mpath-35000c5006304de3f' * * We also store the enclosure sysfs path for turning on enclosure LEDs * (if applicable): * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' */ void update_vdev_config_dev_strs(nvlist_t *nv) { vdev_dev_strs_t vds; const char *env, *type, *path; uint64_t wholedisk = 0; /* * For the benefit of legacy ZFS implementations, allow * for opting out of devid strings in the vdev label. * * example use: * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer * * explanation: * Older OpenZFS implementations had issues when attempting to * display pool config VDEV names if a "devid" NVP value is * present in the pool's config. * * For example, a pool that originated on illumos platform would * have a devid value in the config and "zpool status" would fail * when listing the config. * * A pool can be stripped of any "devid" values on import or * prevented from adding them on zpool create|add by setting * ZFS_VDEV_DEVID_OPT_OUT. */ env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); if (env && (strtoul(env, NULL, 0) > 0 || !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); return; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || strcmp(type, VDEV_TYPE_DISK) != 0) { return; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * Update device string values in the config nvlist. */ if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); if (vds.vds_devphys[0] != '\0') { (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } update_vdev_config_dev_sysfs_path(nv, path); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } } diff --git a/sys/contrib/openzfs/man/Makefile.am b/sys/contrib/openzfs/man/Makefile.am index 36c1aede106e..45156571eec3 100644 --- a/sys/contrib/openzfs/man/Makefile.am +++ b/sys/contrib/openzfs/man/Makefile.am @@ -1,135 +1,136 @@ dist_noinst_man_MANS = \ %D%/man1/cstyle.1 dist_man_MANS = \ %D%/man1/arcstat.1 \ %D%/man1/raidz_test.1 \ %D%/man1/test-runner.1 \ %D%/man1/zhack.1 \ %D%/man1/ztest.1 \ %D%/man1/zvol_wait.1 \ \ %D%/man5/vdev_id.conf.5 \ \ %D%/man4/spl.4 \ %D%/man4/zfs.4 \ \ %D%/man7/dracut.zfs.7 \ %D%/man7/vdevprops.7 \ %D%/man7/zfsconcepts.7 \ %D%/man7/zfsprops.7 \ %D%/man7/zpool-features.7 \ %D%/man7/zpoolconcepts.7 \ %D%/man7/zpoolprops.7 \ \ %D%/man8/fsck.zfs.8 \ %D%/man8/mount.zfs.8 \ %D%/man8/vdev_id.8 \ %D%/man8/zdb.8 \ %D%/man8/zfs.8 \ %D%/man8/zfs-allow.8 \ %D%/man8/zfs-bookmark.8 \ %D%/man8/zfs-change-key.8 \ %D%/man8/zfs-clone.8 \ %D%/man8/zfs-create.8 \ %D%/man8/zfs-destroy.8 \ %D%/man8/zfs-diff.8 \ %D%/man8/zfs-get.8 \ %D%/man8/zfs-groupspace.8 \ %D%/man8/zfs-hold.8 \ %D%/man8/zfs-inherit.8 \ %D%/man8/zfs-list.8 \ %D%/man8/zfs-load-key.8 \ %D%/man8/zfs-mount.8 \ %D%/man8/zfs-program.8 \ %D%/man8/zfs-project.8 \ %D%/man8/zfs-projectspace.8 \ %D%/man8/zfs-promote.8 \ %D%/man8/zfs-receive.8 \ %D%/man8/zfs-recv.8 \ %D%/man8/zfs-redact.8 \ %D%/man8/zfs-release.8 \ %D%/man8/zfs-rename.8 \ %D%/man8/zfs-rollback.8 \ %D%/man8/zfs-send.8 \ %D%/man8/zfs-set.8 \ %D%/man8/zfs-share.8 \ %D%/man8/zfs-snapshot.8 \ %D%/man8/zfs-unallow.8 \ %D%/man8/zfs-unload-key.8 \ %D%/man8/zfs-unmount.8 \ %D%/man8/zfs-upgrade.8 \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ + %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ %D%/man8/zpool-add.8 \ %D%/man8/zpool-attach.8 \ %D%/man8/zpool-checkpoint.8 \ %D%/man8/zpool-clear.8 \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ %D%/man8/zpool-events.8 \ %D%/man8/zpool-export.8 \ %D%/man8/zpool-get.8 \ %D%/man8/zpool-history.8 \ %D%/man8/zpool-import.8 \ %D%/man8/zpool-initialize.8 \ %D%/man8/zpool-iostat.8 \ %D%/man8/zpool-labelclear.8 \ %D%/man8/zpool-list.8 \ %D%/man8/zpool-offline.8 \ %D%/man8/zpool-online.8 \ %D%/man8/zpool-reguid.8 \ %D%/man8/zpool-remove.8 \ %D%/man8/zpool-reopen.8 \ %D%/man8/zpool-replace.8 \ %D%/man8/zpool-resilver.8 \ %D%/man8/zpool-scrub.8 \ %D%/man8/zpool-set.8 \ %D%/man8/zpool-split.8 \ %D%/man8/zpool-status.8 \ %D%/man8/zpool-sync.8 \ %D%/man8/zpool-trim.8 \ %D%/man8/zpool-upgrade.8 \ %D%/man8/zpool-wait.8 \ %D%/man8/zstream.8 \ %D%/man8/zstreamdump.8 \ %D%/man8/zpool_influxdb.8 if BUILD_FREEBSD dist_man_MANS += \ %D%/man8/zfs-jail.8 \ %D%/man8/zfs-unjail.8 endif if BUILD_LINUX dist_man_MANS += \ %D%/man8/zfs-unzone.8 \ %D%/man8/zfs-zone.8 endif nodist_man_MANS = \ %D%/man8/zed.8 \ %D%/man8/zfs-mount-generator.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) SUBSTFILES += $(nodist_man_MANS) CHECKS += mancheck mancheck: $(top_srcdir)/scripts/mancheck.sh $(srcdir)/%D% if BUILD_LINUX # The manual pager in most Linux distros defaults to "BSD" when .Os is blank, # but leaving it blank makes things a lot easier on # FreeBSD when OpenZFS is vendored in the base system. INSTALL_DATA_HOOKS += man-install-data-hook man-install-data-hook: cd $(DESTDIR)$(mandir) && $(SED) $(ac_inplace) 's/^\.Os$$/.Os OpenZFS/' $(subst %D%/,,$(dist_man_MANS) $(nodist_man_MANS)) endif diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 71a3e67ee67e..4ec52a2fb653 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -1,2541 +1,2603 @@ .\" .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" .Dd July 21, 2023 .Dt ZFS 4 .Os . .Sh NAME .Nm zfs .Nd tuning of the ZFS kernel module . .Sh DESCRIPTION The ZFS module supports these parameters: .Bl -tag -width Ds .It Sy dbuf_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd of the target ARC size. The behavior of the dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the metadata dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th of the target ARC size. The behavior of the metadata dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage over .Sy dbuf_cache_max_bytes when dbufs must be evicted directly. . .It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage below .Sy dbuf_cache_max_bytes when the evict thread stops evicting dbufs. . .It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq uint Set the size of the dbuf cache .Pq Sy dbuf_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq uint Set the size of the dbuf metadata cache .Pq Sy dbuf_metadata_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint Set the size of the mutex array for the dbuf cache. When set to .Sy 0 the array is dynamically sized based on total system memory. . .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. . .It Sy ignore_hole_birth Pq int Alias for .Sy send_holes_without_birth_time . . .It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. . .It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq u64 Min feed interval in milliseconds. Requires .Sy l2arc_feed_again Ns = Ns Ar 1 and only applicable in related situations. . .It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq u64 Seconds between L2ARC writing. . .It Sy l2arc_headroom Ns = Ns Sy 2 Pq u64 How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of .Sy l2arc_write_max . ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to .Sy 0 , allowing the full length of ARC lists to be searched for cacheable content. . .It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq u64 Scales .Sy l2arc_headroom by this percentage when L2ARC contents are being successfully compressed before writing. A value of .Sy 100 disables this feature. . .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether buffers present on special vdevs are eligible for caching into L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. . .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. .Pp The default is off, meaning both MRU and MFU data and metadata are cached. When turning off this feature, some MRU buffers will still be present in ARC and eventually cached on L2ARC. .No If Sy l2arc_noprefetch Ns = Ns Sy 0 , some prefetched buffers will be cached to L2ARC, and those might later transition to MRU, in which case the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp Regardless of .Sy l2arc_noprefetch , some MFU buffers might be evicted from ARC, accessed later on as prefetches and transition to MRU as prefetches. If accessed again they are counted as MRU and the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp The ARC status of L2ARC buffers when they were first cached in L2ARC can be seen in the .Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize arcstats when importing the pool or onlining a cache device if persistent L2ARC is enabled. .Pp The .Sy evict_l2_eligible_mru arcstat does not take into account if this option is enabled as the information provided by the .Sy evict_l2_eligible_m[rf]u arcstats can be used to decide if toggling this option is appropriate for the current workload. . .It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq uint Percent of ARC size allowed for L2ARC-only headers. Since L2ARC buffers are not evicted on memory pressure, too many headers on a system with an irrationally large L2ARC can render it slow or unusable. This parameter limits L2ARC writes and rebuilds to achieve the target. . .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64 Trims ahead of the current write size .Pq Sy l2arc_write_max on L2ARC devices by this percentage of write size if we have filled the device. If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. A minimum of .Sy 64 MiB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is invalid upon importing a pool or onlining a cache device. A value of .Sy 0 disables TRIM on L2ARC altogether and is the default as it can put significant stress on the underlying storage devices. This will vary depending of how well the specific device handles these commands. . .It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Do not write buffers to L2ARC if they were prefetched but not used by applications. In case there are prefetched buffers in L2ARC and this option is later set, we do not read the prefetched buffers from L2ARC. Unsetting this option is useful for caching sequential reads from the disks to L2ARC and serve those reads from L2ARC later on. This may be beneficial in case the L2ARC device is significantly faster in sequential reads than the disks of the pool. .Pp Use .Sy 1 to disable and .Sy 0 to enable caching/reading prefetches to/from L2ARC. . .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . .It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64 Cold L2ARC devices will have .Sy l2arc_write_max increased by this amount while they remain cold. . .It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64 Max write bytes per interval. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be disabled if there are problems importing a pool or attaching an L2ARC device (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). . .It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Mininum size of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the persistent L2ARC. .Pp For L2ARC devices less than 1 GiB, the amount of data .Fn l2arc_evict evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . .It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to each disk before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group biasing based on their vdevs' over- or under-utilization relative to the pool. . .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64 Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . .It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int Default dnode indirect block size as a power of 2. . .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). This must be less than .Sy DMU_MAX_ACCESS Pq 64 MiB . This applies primarily to .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . . .It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent log spacemaps from being destroyed during pool exports and destroys. . .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. . .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating from the active metaslab until this option's worth of buckets have been exhausted. . .It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int Load all metaslabs during pool import. . .It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent metaslabs from being unloaded. . .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable use of the fragmentation metric in computing metaslab weights. . .It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see .Em >100`000 iterations and .Fn metaslab_block_picker becomes the performance limiting factor on high-performance storage. .Pp With the default setting of .Sy 16 MiB , we typically see less than .Em 500 iterations, even with very fragmented .Sy ashift Ns = Ns Sy 9 pools. The maximum number of iterations possible is .Sy metaslab_df_max_search / 2^(ashift+1) . With the default setting of .Sy 16 MiB this is .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 or .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . . .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int If not searching forward (due to .Sy metaslab_df_max_search , metaslab_df_free_pct , .No or Sy metaslab_df_alloc_threshold ) , this tunable controls which segment is used. If set, we will use the largest free segment. If unset, we will use a segment of at least the requested size. . .It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq u64 When we unload a metaslab, we cache the size of the largest free chunk. We use that cached size to determine whether or not to load a metaslab for a given allocation. As more frees accumulate in that metaslab while it's unloaded, the cached max size becomes less and less accurate. After a number of seconds controlled by this tunable, we stop considering the cached max size and start considering only the histogram instead. . .It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq uint When we are loading a new metaslab, we check the amount of memory being used to store metaslab range trees. If it is over a threshold, we attempt to unload the least recently used metaslab to prevent the system from clogging all of its memory with range trees. This tunable sets the percentage of total system memory that is the threshold. . .It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int .Bl -item -compact .It If unset, we will first try normal allocation. .It If that fails then we will do a gang allocation. .It If that fails then we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El .Pp .Bl -item -compact .It If set, we will first try normal allocation. .It If that fails then we will do a "try hard" allocation. .It If that fails we will do a gang allocation. .It If that fails we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El . .It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq uint When not trying hard, we only consider this number of the best metaslabs. This improves performance, especially when there are many metaslabs per vdev and the allocation can't actually be satisfied (so we would otherwise iterate all metaslabs). . .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint When a vdev is added, target this number of metaslabs per top-level vdev. . .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint Default lower limit for metaslab size. . .It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint Default upper limit for metaslab size. . .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . .It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq uint Minimum number of metaslabs to create in a top-level vdev. . .It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int Skip label validation steps during pool import. Changing is not recommended unless you know what you're doing and are recovering a damaged label. . .It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq uint Practical upper limit of total metaslabs per top-level vdev. . .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . .It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint Maximum number of metaslabs per group to preload . .It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint Percentage of CPUs to run a metaslab preload taskq . .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, as is typically the case on a modern constant angular velocity disk drive. . .It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq uint After a metaslab is used, we keep it loaded for this many TXGs, to attempt to reduce unnecessary reloading. Note that both this many TXGs and .Sy metaslab_unload_delay_ms milliseconds must pass before unloading will occur. . .It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq uint After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note, that both this many milliseconds and .Sy metaslab_unload_delay TXGs must pass before unloading will occur. . .It Sy reference_history Ns = Ns Sy 3 Pq uint Maximum reference holders being tracked when reference_tracking_enable is active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to .Sy refcount_t objects (debug builds only). . .It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int When set, the .Sy hole_birth optimization will not be used, and all holes will always be sent during a .Nm zfs Cm send . This is useful if you suspect your datasets are affected by a bug in .Sy hole_birth . . .It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp SPA config file. . .It Sy spa_asize_inflation Ns = Ns Sy 24 Pq uint Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration. Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. . .It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to print the vdev tree in the debugging message buffer during pool import. . .It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse data blocks during an "extreme rewind" .Pq Fl X import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal skips non-metadata blocks. It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. . .It Sy spa_load_verify_metadata Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse blocks during an "extreme rewind" .Pq Fl X pool import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. . .It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq uint Sets the maximum number of bytes to consume during pool import to the log2 fraction of the target ARC size. . .It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int Normally, we don't allow the last .Sy 3.2% Pq Sy 1/2^spa_slop_shift of space in the pool to be consumed. This ensures that we don't run the pool completely out of space, due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the .Sy head_errlog feature. The default is to convert all log entries. . .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, which will be included as "unnecessary" data in a chunk of copied data. .Pp The default value here was chosen to align with .Sy zfs_vdev_read_gap_limit , which is a similar concept when doing regular reads (but there's no reason it has to be the same). . .It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Logical ashift for file-based devices. . .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Physical ashift for file-based devices. . .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, when we start iterating over a ZAP object, prefetch the entire object (all leaf blocks). However, this is limited by .Sy dmu_prefetch_max . . .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to this value, doubling on each hit. After that it may grow further by 1/8 per hit, but only if some prefetch since last time haven't completed in time to satisfy demand request, i.e. prefetch depth didn't cover the read latency or the pool got saturated. . .It Sy zfetch_max_distance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch per stream. . .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . .It Sy zfetch_min_sec_reap Ns = Ns Sy 1 Pq uint Min time before inactive prefetch stream can be reclaimed . .It Sy zfetch_max_sec_reap Ns = Ns Sy 2 Pq uint Max time before inactive prefetch stream can be deleted . .It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enables ARC from using scatter/gather lists and forces all allocations to be linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. . .It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. .Pp The value of .Sy MAX_ORDER depends on kernel configuration. . .It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint This is the minimum allocation size that will use scatter (page-based) ABDs. Smaller allocations will use linear ABDs. . .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64 When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. This value acts as a ceiling to the amount of dnode metadata, and defaults to .Sy 0 , which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp See also .Sy zfs_arc_dnode_limit , which serves a similar purpose but has a higher priority if nonzero. . .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds .Sy zfs_arc_dnode_limit . . .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint The ARC's buffer hash table is sized based on the assumption of an average block size of this value. This works out to roughly 1 MiB of hash table per 1 GiB of physical memory with 8-byte pointers. For configurations with a known larger average block size, this value can be increased to reduce the memory footprint. . .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint When .Fn arc_is_overflowing , .Fn arc_get_data_impl waits for this percent of the requested amount of data to be evicted. For example, by default, for every .Em 2 KiB that's evicted, .Em 1 KiB of it may be "reused" by a new allocation. Since this is above .Sy 100 Ns % , it ensures that progress is made towards getting .Sy arc_size No under Sy arc_c . Since this is finite, it ensures that allocations can still happen, even during the potentially long time that .Sy arc_size No is more than Sy arc_c . . .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint If set to a non zero value, it will replace the .Sy arc_grow_retry value with this value. The .Sy arc_grow_retry .No value Pq default Sy 5 Ns s is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. . .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to .Sy 0 will disable the throttle. . .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 Max size of ARC in bytes. If .Sy 0 , then the max size of ARC is determined by the amount of system memory installed. Under Linux, half of system memory will be used as the limit. Under .Fx , the larger of .Sy all_system_memory No \- Sy 1 GiB and .Sy 5/8 No \(mu Sy all_system_memory will be used as the limit. This value must be at least .Sy 67108864 Ns B Pq 64 MiB . .Pp This value can be changed dynamically, with some caveats. It cannot be set back to .Sy 0 while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint Balance between metadata and data on ghost hits. Values above 100 increase metadata caching by proportionally reducing effect of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. .No If set to Sy 0 , arc_c_min will default to consuming the larger of .Sy 32 MiB and .Sy all_system_memory No / Sy 32 . . .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint Minimum time prefetched blocks are locked in the ARC. . .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint Minimum time "prescient prefetched" blocks are locked in the ARC. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. . .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int Number of arc_prune threads. .Fx does not need more than one. Linux may theoretically use one per mount point up to number of CPUs, but that was not proven to be useful. . .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). . .It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 Maximum size in bytes allowed to be passed as .Sy zc_nvlist_src_size for ioctls on .Pa /dev/zfs . This prevents a user from causing the kernel to allocate an excessive amount of memory. When the limit is exceeded, the ioctl fails with .Sy EINVAL and a description of the error is sent to the .Pa zfs-dbgmsg log. This parameter should not need to be touched under normal circumstances. If .Sy 0 , equivalent to a quarter of the user-wired memory limit under .Fx and to .Sy 134217728 Ns B Pq 128 MiB under Linux. . .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint To allow more fine-grained locking, each ARC state contains a series of lists for both data and metadata objects. Locking is performed at the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .Pp If .Sy 0 , equivalent to the greater of the number of online CPUs and .Sy 4 . . .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int The ARC size is considered to be overflowing if it exceeds the current ARC target size .Pq Sy arc_c by thresholds determined by this parameter. Exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 starts ARC reclamation process. If that appears insufficient, exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 blocks new buffer allocation until the reclaim thread catches up. Started reclamation process continues till ARC size returns below the target size. .Pp The default value of .Sy 8 causes the ARC to start reclamation if it exceeds the target size by .Em 0.2% of the target size, and block allocations by .Em 0.6% . . .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 with the new value. . .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint Percent of pagecache to reclaim ARC to. .Pp This tunable allows the ZFS ARC to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows the ARC to be reclaimed down to .Sy zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by .Sy NR_FILE_PAGES ) , where that percent may exceed .Sy 100 . This only operates during memory pressure/reclaim. . .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. .Pp The default limit of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, even with a small average compressed block size of ~8 KiB. .Pp The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. . .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of .Sy 512 KiB No and Sy all_system_memory/64 . . .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int Disable pool import at module load by ignoring the cache file .Pq Sy spa_config_path . . .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit checksum events to this many per second. Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . .It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). . .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int Vdev indirection layer (used for device removal) sleeps for this many milliseconds during mapping generation. Intended for use with the test suite to throttle vdev removal speed. . .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint Minimum percent of obsolete bytes in vdev mapping required to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . Intended for use with the test suite to facilitate triggering condensing as needed. . .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable condensing indirect vdev mappings. When set, attempt to condense indirect vdev mappings if the mapping uses more than .Sy zfs_condense_min_mapping_bytes bytes of memory and if the obsolete space map object uses more than .Sy zfs_condense_max_obsolete_bytes bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. . .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 Minimum size vdev mapping to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Internally ZFS keeps a small log to facilitate debugging. The log is enabled by default, and can be disabled by unsetting this option. The contents of the log can be accessed by reading .Pa /proc/spl/kstat/zfs/dbgmsg . Writing .Sy 0 to the file clears the log. .Pp This setting does not influence debug prints due to .Sy zfs_flags . . .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Maximum size of the internal ZFS debug log. . .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , or when an individual I/O operation takes longer than .Sy zfs_deadman_ziotime_ms , then the operation is considered to be "hung". If .Sy zfs_deadman_enabled is set, then the deadman behavior is invoked as described by .Sy zfs_deadman_failmode . By default, the deadman is enabled and set to .Sy wait which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: .Bl -tag -compact -offset 4n -width "continue" .It Sy wait Wait for a "hung" operation to complete. For each "hung" operation a "deadman" event will be posted describing that operation. .It Sy continue Attempt to recover from a "hung" operation by re-dispatching it to the I/O pipeline if possible. .It Sy panic Panic the system. This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 Check time in milliseconds. This defines the frequency at which we check for hung I/O requests and potentially invoke the .Sy zfs_deadman_failmode behavior. . .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the pool sync completes. . .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the operation remains "hung", the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the operation completes. . .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of .Sy zfs_dirty_data_max . This value should be at least .Sy zfs_vdev_async_write_active_max_dirty_percent . .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .Pp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number. .No See Sx ZFS TRANSACTION DELAY . .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. The default value is also the maximum. . .It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint Size of the znode hashtable used for holds. .Pp Due to the need to hold locks on objects that may not exist yet, kernel mutexes are not created per-object and instead a hashtable is used where collisions will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int Rate limit delay and deadman zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory, in bytes. . .It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq u64 Part of overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap, in millionths. . .It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq u64 Describes the maximum number of log spacemap blocks allowed for each pool. The default value means that the space in all the log spacemaps can add up to no more than .Sy 131072 blocks (which means .Em 16 GiB of logical space before compression and ditto blocks, assuming that blocksize is .Em 128 KiB ) . .Pp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease the number of I/O operations for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the amount of flushing increases, destroying log blocks quicker as they become obsolete faster, which leaves less blocks to be read during import time after a crash. .Pp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. . .It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq u64 If the number of metaslabs is small and our incoming rate is high, we could get into a situation that we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. . .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq u64 Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of unflushed metaslabs in the pool. . .It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq u64 Tunable limiting maximum time in TXGs any metaslab may remain unflushed. It effectively limits maximum number of unflushed per-TXG spacemap logs that need to be read after unclean pool export. . .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite. . .It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong This is the used to define a large file for the purposes of deletion. Files containing more than .Sy zfs_delete_blocks will be deleted asynchronously, while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an .Xr unlink 2 system call, at the expense of a longer delay before the freed space is available. This only applies on Linux. . .It Sy zfs_dirty_data_max Ns = Pq int Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over .Sy zfs_dirty_data_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy physical_ram/10 , capped at .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_max_max Ns = Pq int Maximum allowable value of .Sy zfs_dirty_data_max , expressed in bytes. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. This parameter takes precedence over .Sy zfs_dirty_data_max_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy min(physical_ram/4, 4GiB) , or .Sy min(physical_ram/4, 1GiB) for 32-bit systems. . .It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq uint Maximum allowable value of .Sy zfs_dirty_data_max , expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. The parameter .Sy zfs_dirty_data_max_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq uint Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter .Sy zfs_dirty_data_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . .Pp Subject to .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq uint Start syncing out a transaction group if there's at least this much dirty data .Pq as a percentage of Sy zfs_dirty_data_max . This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction zil log data size in bytes. Write operations are throttled when approaching the limit until log data is cleared out after transaction group sync. Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput . It also should be smaller than the size of the slog device if slog is present. .Pp Defaults to .Sy zfs_dirty_data_max*2 . .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not run out of space. Instead, .Xr fallocate 2 space preallocation only checks that sufficient space is currently available in the pool or the user's project quota allocation, and then creates a sparse file of the requested size. The requested space is multiplied by .Sy zfs_fallocate_reserve_percent to allow additional space for indirect blocks and other internal metadata. Setting this to .Sy 0 disables support for .Xr fallocate 2 and causes it to return .Sy EOPNOTSUPP . . .It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string Select a fletcher 4 implementation. .Pp Supported selectors are: .Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw , .No and Sy aarch64_neon . All except .Sy fastest No and Sy scalar require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the .Sy fastest will be chosen using a micro benchmark. Selecting .Sy scalar results in the original CPU-based calculation being used. Selecting any option other than .Sy fastest No or Sy scalar results in vector instructions from the respective CPU instruction set being used. . +.It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int +Enable the experimental block cloning feature. +If this setting is 0, then even if feature@block_cloning is enabled, +attempts to clone blocks will act as though the feature is disabled. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp Supported selectors are: .Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 . All except .Sy cycle , fastest No and Sy generic require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of BLAKE3 are available, the .Sy fastest will be chosen using a micro benchmark. You can see the benchmark results by reading this kstat file: .Pa /proc/spl/kstat/zfs/chksum_bench . . .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable the processing of the free_bpobj object. . .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64 Maximum number of blocks freed in a single TXG. . .It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64 Maximum number of dedup blocks freed in a single TXG. . .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint Maximum asynchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq uint Minimum asynchronous read I/O operation active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq uint When the pool has more than this much dirty data, use .Sy zfs_vdev_async_write_max_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq uint When the pool has less than this much dirty data, use .Sy zfs_vdev_async_write_min_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 10 Pq uint Maximum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq uint Minimum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . .Pp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of .Sy 2 was chosen as a compromise. A value of .Sy 3 has been shown to improve resilver performance further at a cost of further increasing latency. . .It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq uint Maximum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq uint Minimum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq uint The maximum number of I/O operations active to each device. Ideally, this will be at least the sum of each queue's .Sy max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_open_timeout_ms Ns = Ns Sy 1000 Pq uint Timeout value to wait before determining a device is missing during import. This is helpful for transient missing paths due to links being briefly removed and recreated in response to udev events. . .It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq uint Maximum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq uint Minimum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq uint Maximum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq uint Minimum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq uint Maximum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq uint Minimum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq uint Maximum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq uint Minimum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq uint For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), the number of concurrently-active I/O operations is limited to .Sy zfs_*_min_active , unless the vdev is "idle". When there are no interactive I/O operations active (synchronous or otherwise), and .Sy zfs_vdev_nia_delay operations have completed since the last interactive operation, then the vdev is considered to be "idle", and the number of concurrently-active non-interactive operations is increased to .Sy zfs_*_max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq uint Some HDDs tend to prioritize sequential I/O so strongly, that concurrent random I/O latency reaches several seconds. On some HDDs this happens even if sequential I/O operations are submitted one at a time, and so setting .Sy zfs_*_max_active Ns = Sy 1 does not help. To prevent non-interactive I/O, like scrub, from monopolizing the device, no more than .Sy zfs_vdev_nia_credit operations can be sent while there are outstanding incomplete interactive operations. This enforced wait ensures the HDD services the interactive I/O within a reasonable amount of time. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint Maximum number of queued allocations per top-level vdev expressed as a percentage of .Sy zfs_vdev_async_write_max_active , which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. This allows for dynamic allocation distribution when devices are imbalanced, as fuller devices will tend to be slower than empty devices. .Pp Also see .Sy zio_dva_throttle_enabled . . .It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint Default queue depth for each vdev IO allocator. Higher values allow for better coalescing of sequential writes before sending them to the disk, but can increase transaction commit times. . .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. The following options may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 Device No driver retries on device errors 2 Transport No driver retries on transport errors. 4 Driver No driver retries on driver errors. .TE . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . . .It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow the creation, removal, or renaming of entries in the .Sy .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled, this functionality works both locally and over NFS exports which have the .Em no_root_squash option set. . .It Sy zfs_flags Ns = Ns Sy 0 Pq int Set additional debugging flags. The following flags may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. * 2 ZFS_DEBUG_DBUF_VERIFY Enable extra dbuf verifications. * 4 ZFS_DEBUG_DNODE_VERIFY Enable extra dnode verifications. 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. * 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-memory \fBrange_trees\fP. 512 ZFS_DEBUG_SET_ERROR Enable \fBSET_ERROR\fP and dprintf entries in the debug log. 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. .TE .Sy \& * No Requires debug build . . .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint Enables btree verification. The following settings are culminative: .TS box; lbz r l l . Value Description 1 Verify height. 2 Verify pointers from children to parent. 3 Verify element counts. 4 Verify element order. (expensive) * 5 Verify unused memory is poisoned. (expensive) .TE .Sy \& * No Requires debug build . . .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the .Sy EIO , permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. .Pp The default "stalling" behavior is useful if the storage partially fails (i.e. some but not all I/O operations fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. Note, however, that this case is actually fairly rare. .Pp Typically pools either .Bl -enum -compact -offset 4n -width "1." .It fail completely (but perhaps temporarily, e.g. due to a top-level vdev going offline), or .It have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). .El In the former case, this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In the latter, because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. It is therefore reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. . .It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint During a .Nm zfs Cm destroy operation using the .Sy async_destroy feature, a minimum of this much time will be spent working on freeing blocks per TXG. . .It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint Similar to .Sy zfs_free_min_time_ms , but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 Largest data block to write to the ZIL. Larger blocks will be treated as if the dataset being written to had the .Sy logbias Ns = Ns Sy throughput property set. . .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by .Xr zpool-initialize 8 . . .It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Size of writes used by .Xr zpool-initialize 8 . This option is used by the test suite. . .It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq u64 The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. . .It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. This is in place because livelists no long give us a benefit once a clone has been overwritten enough. . .It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_sync . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the synctask \(em .Fn spa_livelist_condense_sync . This option is used by the test suite to trigger race conditions. . .It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_cb . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the open context condensing work in .Fn spa_livelist_condense_cb . This option is used by the test suite to trigger race conditions. . .It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq u64 The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. . .It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq u64 The maximum memory limit that can be set for a ZFS channel program, specified in bytes. . .It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. . .It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq u64 The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. . .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq u64 Maximum number of rows allowed in the summary of the spacemap log. . .It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq uint We currently support block sizes from .Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. Therefore, we formerly forbade creating blocks larger than 1M. Larger blocks could be created by changing it, and pools with larger blocks can always be imported and used, regardless of this setting. . .It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. . .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. . .It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq uint Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. . .It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq uint Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of .Sy 0 disables the feature and causes all metaslab groups to be eligible for allocations. .Pp This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old .Sy zfs_mg_alloc_failures facility. . .It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place DDT data into the special allocation class. . .It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place user data indirect blocks into the special allocation class. . .It Sy zfs_multihost_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest multihost updates will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . . .It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq u64 Used to control the frequency of multihost writes which are performed when the .Sy multihost pool property is on. This is one of the factors used to determine the length of the activity check during import. .Pp The multihost write period is .Sy zfs_multihost_interval No / Sy leaf-vdevs . On average a multihost write will be issued for each leaf vdev every .Sy zfs_multihost_interval milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. . .It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint Used to control the duration of the activity test on import. Smaller values of .Sy zfs_multihost_import_intervals will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .Pp On import the activity check waits a minimum amount of time determined by .Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals , or the same product computed on the host which last had the pool imported, whichever is greater. The activity check time may be further extended if the value of MMP delay found in the best uberblock indicates actual multihost updates happened at longer intervals than .Sy zfs_multihost_interval . A minimum of .Em 100 ms is enforced. .Pp .Sy 0 No is equivalent to Sy 1 . . .It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint Controls the behavior of the pool when multihost write failures or delays are detected. .Pp When .Sy 0 , multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .Pp Otherwise, the pool will be suspended if .Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval milliseconds pass without a successful MMP write. This guarantees the activity test will see MMP writes if the pool is imported. .Sy 1 No is equivalent to Sy 2 ; this is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. . .It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. . .It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable block prefetching for scrubs. . .It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Allow no-operation writes. The occurrence of nopwrites will further depend on other pool properties .Pq i.a. the checksumming and compression algorithms . . .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable forcing TXG sync to find holes. When enabled forces ZFS to sync data when .Sy SEEK_HOLE No or Sy SEEK_DATA flags are used allowing holes in a file to be accurately reported. When disabled holes will not be reported in recently dirtied files. . .It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int The number of bytes which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq uint The number of blocks pointed by indirect (non-L0) block which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 30 Ns % Pq u64 Control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. .Sy 0 No disables this throttle . . .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable predictive prefetch. Note that it leaves "prescient" prefetch .Pq for, e.g., Nm zfs Cm send intact. Unlike predictive prefetch, prescient prefetch never issues I/O that ends up not being needed, so it can't hurt performance. . .It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for SHA256 checksums. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for gzip compression. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for AES-GCM encryption. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Bytes to read per chunk. . .It Sy zfs_read_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest reads will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads . . .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int Include cache hits in read history . .It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Maximum read segment size to issue when sequentially resilvering a top-level vdev. . .It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub when the last active sequential resilver completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . .It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . .It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most this many randomly selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. . .It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. . .It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore hard I/O errors during device removal. When set, if a device encounters a hard I/O error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is hence not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. . .It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. . .It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The largest contiguous segment that we will attempt to allocate when removing a device. If there is a performance problem with attempting to allocate large blocks, consider decreasing this. The default value is also the maximum. . .It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore the .Sy resilver_defer feature, causing an operation that would start a resilver to immediately restart the one in progress. . .It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between TXG flushes. . .It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub), even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . .It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint Error blocks to be scrubbed in one txg. . .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. The frequency of this flushing is determined by this tunable. . .It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq uint This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. . .It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq uint Determines the order that data will be verified while scrubbing or resilvering: .Bl -tag -compact -offset 4n -width "a" .It Sy 1 Data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing .Pq see Sy zfs_scan_mem_lim_fact . This may improve scrub performance if the pool's data is very fragmented. .It Sy 2 The largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. .It Sy 0 .No Use strategy Sy 1 No during normal verification .No and strategy Sy 2 No while taking a checkpoint . .El . .It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int If unset, indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. Otherwise indicates that the legacy algorithm will be used, where I/O is initiated as soon as it is discovered. Unsetting will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int Sets the largest gap in bytes between scrub/resilver I/O operations that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq uint Maximum fraction of RAM used for I/O sorting by sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. . .It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq uint The fraction of the hard limit used to determined the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . .It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint When reporting resilver throughput and estimated completion time use the performance observed over roughly the last .Sy zfs_scan_report_txgs TXGs. When set to zero performance is calculated over the time between checkpoints. . .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. . .It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . .It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . .It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow sending of corrupt data (ignore read/checksum errors when sending). . .It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int Include unmodified spill blocks in the send stream. Under certain circumstances, previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards-compatible stream which will recreate a spill block if it was incorrectly removed. . .It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send internal queues. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum number of bytes allowed in .Nm zfs Cm send Ns 's internal queues. . .It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send prefetch queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed that will be prefetched by .Nm zfs Cm send . This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm receive queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed in the .Nm zfs Cm receive queue. This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum amount of data, in bytes, that .Nm zfs Cm receive will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of .Sy 32 MiB . . .It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int When this variable is set to non-zero a corrective receive: .Bl -enum -compact -offset 4n -width "1." .It Does not enforce the restriction of source & destination snapshot GUIDs matching. .It If there is an error during healing, the healing receive is not terminated instead it moves on to the next record. .El . .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq uint Setting this variable overrides the default logic for estimating block sizes when doing a .Nm zfs Cm send . The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. . .It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq uint Flushing of data to disk is done in passes. Defer frees starting in this pass. . .It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool metadata as the special vdevs approach capacity. . .It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq uint Starting in this sync pass, disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .Pp The original intent was that disabling compression would help the sync passes to converge. However, in practice, disabling compression increases the average number of sync passes; because when we turn compression off, many blocks' size will change, and thus we have to re-allocate (not overwrite) them. It also increases the number of .Em 128 KiB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The .Em 128 KiB allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy these allocations. . .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . .It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int This controls the number of threads used by .Sy dp_sync_taskq . The default value of .Sy 75% will create a maximum of one thread per CPU. . .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . .It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped, unless they're part of a larger range which was chunked. This is done because it's common for these small TRIMs to negatively impact overall performance. . .It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages, an increasing fraction of the pool's metaslabs will be initialized, progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. . .It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by .Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active . . .It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint The number of transaction groups' worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .Pp Increasing this value will allow frees to be aggregated for a longer time. This will result is larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default of .Sy 32 was determined to be a reasonable compromise. . .It Sy zfs_txg_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest TXGs will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs . . .It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq uint Flush dirty data to disk at least every this many seconds (maximum TXG duration). . .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation immediately follows its predecessor on rotational vdevs for the purpose of making decisions based on load. . .It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum distance for the last queued I/O operation in which the balancing algorithm considers an operation to have locality. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/O operations do not immediately follow one another. . .It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by the .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Aggregate read I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq uint Aggregate write I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string Select the raidz parity implementation to use. .Pp Variants that don't depend on CPU-specific features may be selected on module load, as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. .Pp Once the module is loaded, .Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show the available options, with the currently selected one enclosed in square brackets. .Pp .TS lb l l . fastest selected by built-in benchmark original original implementation scalar scalar implementation sse2 SSE2 instruction set 64-bit x86 ssse3 SSSE3 instruction set 64-bit x86 avx2 AVX2 instruction set 64-bit x86 avx512f AVX512F instruction set 64-bit x86 avx512bw AVX512F & AVX512BW instruction sets 64-bit x86 aarch64_neon NEON Aarch64/64-bit ARMv8 aarch64_neonx2 NEON with more unrolling Aarch64/64-bit ARMv8 powerpc_altivec Altivec PowerPC .TE . .It Sy zfs_vdev_scheduler Pq charp .Sy DEPRECATED . Prints warning to kernel log for compatibility. . .It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq uint Max event queue length. Events in the queue can be viewed with .Xr zpool-events 8 . . .It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int Maximum recent zevent records to retain for duplicate checking. Setting this to .Sy 0 disables duplicate detection. . .It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int Lifespan for a recent ereport that was retained for duplicate checking. . .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. . .It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. . .It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int This controls the number of threads used by .Sy dp_zil_clean_taskq . The default value of .Sy 100% will create a maximum of one thread per cpu. . .It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . .It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . .It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64 This sets the minimum delay in nanoseconds ZIL care to delay block commit, waiting for more records. If ZIL writes are too fast, kernel may not be able sleep for so short interval, increasing log latency above allowed by .Sy zfs_commit_timeout_pct . . .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . -.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64 +.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. . .It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int Setting this tunable to zero disables ZIL logging of new .Sy xattr Ns = Ns Sy sa records if the .Sy org.openzfs:zilsaxattr feature is enabled on the pool. This would only be necessary to work around bugs in the ZIL logging or replay code for this record type. The tunable has no effect if the feature is disabled. . .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint Usually, one metaslab from each normal-class vdev is dedicated for use by the ZIL to log synchronous writes. However, if there are fewer than .Sy zfs_embedded_slog_min_ms metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . .It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq uint Whether heuristic for detection of incompressible data with zstd levels >= 3 using LZ4 and zstd-1 passes is enabled. . .It Sy zstd_abort_size Ns = Ns Sy 131072 Pq uint Minimal uncompressed size (inclusive) of a record before the early abort heuristic will be attempted. . .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive: typically conditions in which a thread in the zio pipeline is looping indefinitely. . .It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int When an I/O operation takes more than this much time to complete, it's marked as slow. Each slow operation causes a delay zevent. Slow I/O counters can be seen with .Nm zpool Cm status Fl s . . .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution when devices are imbalanced. When enabled, the maximum number of pending allocations per top-level vdev is limited by .Sy zfs_vdev_queue_depth_pct . . .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int Control the naming scheme used when setting new xattrs in the user namespace. If .Sy 0 .Pq the default on Linux , user namespace xattr names are prefixed with the namespace, to be backwards compatible with previous versions of ZFS on Linux. If .Sy 1 .Pq the default on Fx , user namespace xattr names are not prefixed, to be backwards compatible with previous versions of ZFS on illumos and .Fx . .Pp Either naming scheme can be read on this and future versions of ZFS, regardless of this tunable, but legacy ZFS on illumos or .Fx are unable to read user namespace xattrs written in the Linux format, and legacy versions of ZFS on Linux are unable to read user namespace xattrs written in the legacy ZFS format. .Pp An existing xattr with the alternate naming scheme is removed when overwriting the xattr so as to not accumulate duplicates. . .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. These workers are responsible for I/O work such as compression and checksum calculations. Fractional number of CPUs will be rounded down. .Pp The default value of .Sy 80% was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. Lower values improve I/O ordering and CPU utilization, while higher reduces lock contention. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. . .It Sy zvol_major Ns = Ns Sy 230 Pq uint Major number for zvol block devices. . .It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq long Discard (TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the .Sy volblocksize property of a zvol. . .It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint When adding a zvol to the system, prefetch this many bytes from the start and end of the volume. Prefetching these regions of the volume is desirable, because they are likely to be accessed immediately by .Xr blkid 8 or the kernel partitioner. . .It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint When processing I/O requests for a zvol, submit them synchronously. This effectively limits the queue depth to .Em 1 for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads . .Sy zvol_request_sync is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If .Sy 0 (the default) then internally set .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . +.It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint +The number of threads per zvol to use for queuing IO requests. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +If +.Sy 0 +(the default) then internally set +.Sy zvol_blk_mq_threads +to the number of CPUs present. +. +.It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Set to +.Sy 1 +to use the +.Li blk-mq +API for zvols. +Set to +.Sy 0 +(the default) to use the legacy zvol APIs. +This setting can give better or worse zvol performance depending on +the workload. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only read and assigned to a zvol at zvol load time. +. +.It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint +If +.Sy zvol_use_blk_mq +is enabled, then process this number of +.Sy volblocksize Ns -sized blocks per zvol thread. +This tunable can be use to favor better performance for zvol reads (lower +values) or writes (higher values). +If set to +.Sy 0 , +then the zvol layer will process the maximum number of blocks +per thread that it can. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +. +.It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint +The queue_depth value for the zvol +.Li blk-mq +interface. +This parameter will only appear if your kernel supports +.Li blk-mq +and is only applied at each zvol's load time. +If +.Sy 0 +(the default) then use the kernel's default queue depth. +Values are clamped to the kernel's +.Dv BLKDEV_MIN_RQ +and +.Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ +limits. +. .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices behaviour when .Sy volmode Ns = Ns Sy default : .Bl -tag -compact -offset 4n -width "a" .It Sy 1 .No equivalent to Sy full .It Sy 2 .No equivalent to Sy dev .It Sy 3 .No equivalent to Sy none .El . .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. The strict quota enforcement may have a performance impact. .El . .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. The scheduler divides operations into five I/O classes, prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, .Sy zfs_vdev_max_active . Note that the sum of the per-queue minima must not exceed the aggregate maximum. If the sum of the per-queue maxima exceeds the aggregate maximum, then the number of active operations may reach .Sy zfs_vdev_max_active , in which case no further operations will be issued, regardless of whether all per-queue minima have been met. .Pp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers. Furthermore, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .Pp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit, or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O operation is queued or an operation completes, the scheduler looks for new operations to issue. .Pp In general, smaller .Sy max_active Ns s will lead to lower latency of synchronous operations. Larger .Sy max_active Ns s may lead to higher overall throughput, depending on underlying storage. .Pp The ratio of the queues' .Sy max_active Ns s determines the balance of performance between reads, writes, and scrubs. For example, increasing .Sy zfs_vdev_scrub_max_active will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .Pp All I/O classes have a fixed maximum number of outstanding operations, except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically, so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write operations according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of simultaneous operations also stabilizes the response time of operations from other queues, in particular synchronous ones. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there is more dirty data in the pool. . .Ss Async Writes The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points: .Bd -literal | o---------| <-- \fBzfs_vdev_async_write_max_active\fP ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- \fBzfs_vdev_async_write_min_active\fP 0|_______^______|_________| 0% | | 100% of \fBzfs_dirty_data_max\fP | | | `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP .Ed .Pp Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .Pp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between .Sy zfs_vdev_async_write_active_min_dirty_percent and .Sy zfs_vdev_async_write_active_max_dirty_percent . If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. . .Sh ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .Pp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .Pp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .Pp The minimum time for a transaction to take is calculated as .D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms) .Pp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by .Sy zfs_delay_min_dirty_percent . This should typically be at or above .Sy zfs_vdev_async_write_active_max_dirty_percent , so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by .Sy zfs_delay_scale . Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .Bd -literal delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | \fBzfs_delay_scale\fP ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note, that since the delay is added to the outstanding time remaining on the most recent transaction it's effectively the inverse of IOPS. Here, the midpoint of .Em 500 us translates to .Em 2000 IOPS . The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first three quarters of the curve yield relatively small differences in the amount of delay. .Pp The effects can be easier to understand when the amount of delay is represented on a logarithmic scale: .Bd -literal delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + \fBzfs_delay_scale\fP ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the back-end storage, and then by changing the value of .Sy zfs_delay_scale to increase the steepness of the curve. diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index b901ce6c2935..8ca4bd927b24 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -1,954 +1,963 @@ .\" .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley .\" .Dd June 23, 2022 .Dt ZPOOL-FEATURES 7 .Os . .Sh NAME .Nm zpool-features .Nd description of ZFS pool features . .Sh DESCRIPTION ZFS pool on-disk format versions are specified via .Dq features which replace the old on-disk format numbers .Pq the last supported on-disk format number is 28 . To enable a feature on a pool use the .Nm zpool Cm upgrade , or set the .Sy feature Ns @ Ns Ar feature-name property to .Sy enabled . Please also see the .Sx Compatibility feature sets section for information on how sets of features may be enabled together. .Pp The pool format does not affect file system version compatibility or the ability to send file systems between pools. .Pp Since most features can be enabled independently of each other, the on-disk format of the pool is specified by the set of all features marked as .Sy active on the pool. If the pool was created by another software version this set may include unsupported features. . .Ss Identifying features Every feature has a GUID of the form .Ar com.example : Ns Ar feature-name . The reversed DNS name ensures that the feature's GUID is unique across all ZFS implementations. When unsupported features are encountered on a pool they will be identified by their GUIDs. Refer to the documentation for the ZFS implementation that created the pool for information about those features. .Pp Each supported feature also has a short name. By convention a feature's short name is the portion of its GUID which follows the .Sq \&: .Po i.e. .Ar com.example : Ns Ar feature-name would have the short name .Ar feature-name .Pc , however a feature's short name may differ across ZFS implementations if following the convention would result in name conflicts. . .Ss Feature states Features can be in one of three states: .Bl -tag -width "disabled" .It Sy active This feature's on-disk format changes are in effect on the pool. Support for this feature is required to import the pool in read-write mode. If this feature is not read-only compatible, support is also required to import the pool in read-only mode .Pq see Sx Read-only compatibility . .It Sy enabled An administrator has marked this feature as enabled on the pool, but the feature's on-disk format changes have not been made yet. The pool can still be imported by software that does not support this feature, but changes may be made to the on-disk format at any time which will move the feature to the .Sy active state. Some features may support returning to the .Sy enabled state after becoming .Sy active . See feature-specific documentation for details. .It Sy disabled This feature's on-disk format changes have not been made and will not be made unless an administrator moves the feature to the .Sy enabled state. Features cannot be disabled once they have been enabled. .El .Pp The state of supported features is exposed through pool properties of the form .Sy feature Ns @ Ns Ar short-name . . .Ss Read-only compatibility Some features may make on-disk format changes that do not interfere with other software's ability to read from the pool. These features are referred to as .Dq read-only compatible . If all unsupported features on a pool are read-only compatible, the pool can be imported in read-only mode by setting the .Sy readonly property during import .Po see .Xr zpool-import 8 for details on importing pools .Pc . . .Ss Unsupported features For each unsupported feature enabled on an imported pool, a pool property named .Sy unsupported Ns @ Ns Ar feature-name will indicate why the import was allowed despite the unsupported feature. Possible values for this property are: .Bl -tag -width "readonly" .It Sy inactive The feature is in the .Sy enabled state and therefore the pool's on-disk format is still compatible with software that does not support this feature. .It Sy readonly The feature is read-only compatible and the pool has been imported in read-only mode. .El . .Ss Feature dependencies Some features depend on other features being enabled in order to function. Enabling a feature will automatically enable any features it depends on. . .Ss Compatibility feature sets It is sometimes necessary for a pool to maintain compatibility with a specific on-disk format, by enabling and disabling particular features. The .Sy compatibility feature facilitates this by allowing feature sets to be read from text files. When set to .Sy off .Pq the default , compatibility feature sets are disabled .Pq i.e. all features are enabled ; when set to .Sy legacy , no features are enabled. When set to a comma-separated list of filenames .Po each filename may either be an absolute path, or relative to .Pa /etc/zfs/compatibility.d or .Pa /usr/share/zfs/compatibility.d .Pc , the lists of requested features are read from those files, separated by whitespace and/or commas. Only features present in all files are enabled. .Pp Simple sanity checks are applied to the files: they must be between 1 B and 16 KiB in size, and must end with a newline character. .Pp The requested features are applied when a pool is created using .Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar … and controls which features are enabled when using .Nm zpool Cm upgrade . .Nm zpool Cm status will not show a warning about disabled features which are not part of the requested feature set. .Pp The special value .Sy legacy prevents any features from being enabled, either via .Nm zpool Cm upgrade or .Nm zpool Cm set Sy feature Ns @ Ns Ar feature-name Ns = Ns Sy enabled . This setting also prevents pools from being upgraded to newer on-disk versions. This is a safety measure to prevent new features from being accidentally enabled, breaking compatibility. .Pp By convention, compatibility files in .Pa /usr/share/zfs/compatibility.d are provided by the distribution, and include feature sets supported by important versions of popular distributions, and feature sets commonly supported at the start of each year. Compatibility files in .Pa /etc/zfs/compatibility.d , if present, will take precedence over files with the same name in .Pa /usr/share/zfs/compatibility.d . .Pp If an unrecognized feature is found in these files, an error message will be shown. If the unrecognized feature is in a file in .Pa /etc/zfs/compatibility.d , this is treated as an error and processing will stop. If the unrecognized feature is under .Pa /usr/share/zfs/compatibility.d , this is treated as a warning and processing will continue. This difference is to allow distributions to include features which might not be recognized by the currently-installed binaries. .Pp Compatibility files may include comments: any text from .Sq # to the end of the line is ignored. .Pp .Sy Example : .Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 +allocation_classes async_destroy +block_cloning bookmarks +device_rebuild embedded_data empty_bpobj enabled_txg extensible_dataset filesystem_limits hole_birth large_blocks livelist +log_spacemap lz4_compress +project_quota +resilver_defer spacemap_histogram +spacemap_v2 +userobj_accounting +zilsaxattr zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed .Pp See .Xr zpool-create 8 and .Xr zpool-upgrade 8 for more information on how these commands are affected by feature sets. . .de feature .It Sy \\$2 .Bl -tag -compact -width "READ-ONLY COMPATIBLE" .It GUID .Sy \\$1:\\$2 .if !"\\$4"" \{\ .It DEPENDENCIES \fB\\$4\fP\c .if !"\\$5"" , \fB\\$5\fP\c .if !"\\$6"" , \fB\\$6\fP\c .if !"\\$7"" , \fB\\$7\fP\c .if !"\\$8"" , \fB\\$8\fP\c .if !"\\$9"" , \fB\\$9\fP\c .\} .It READ-ONLY COMPATIBLE \\$3 .El .Pp .. . .ds instant-never \ .No This feature becomes Sy active No as soon as it is enabled \ and will never return to being Sy enabled . . .ds remount-upgrade \ .No Each filesystem will be upgraded automatically when remounted, \ or when a new file is created under that filesystem. \ The upgrade can also be triggered on filesystems via \ Nm zfs Cm set Sy version Ns = Ns Sy current Ar fs . \ No The upgrade process runs in the background and may take a while to complete \ for filesystems containing large amounts of files . . .de checksum-spiel When the .Sy \\$1 feature is set to .Sy enabled , the administrator can turn on the .Sy \\$1 checksum on any dataset using .Nm zfs Cm set Sy checksum Ns = Ns Sy \\$1 Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy checksum property has been set to .Sy \\$1 , and will return to being .Sy enabled once all filesystems that have ever had their checksum set to .Sy \\$1 are destroyed. .. . .Sh FEATURES The following features are supported on this system: .Bl -tag -width Ds .feature org.zfsonlinux allocation_classes yes This feature enables support for separate allocation classes. .Pp This feature becomes .Sy active when a dedicated allocation class vdev .Pq dedup or special is created with the .Nm zpool Cm create No or Nm zpool Cm add No commands . With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. Without .Sy async_destroy , the file system is not fully removed until all space has been reclaimed. If the destroy operation is interrupted by a reboot or power outage, the next attempt to open the pool will need to complete the destroy operation synchronously. .Pp When .Sy async_destroy is enabled, the file system's data will be reclaimed by a background process, allowing the destroy operation to complete without traversing the entire file system. The background process is able to resume interrupted destroys after the pool has been opened, eliminating the need to finish interrupted destroys as part of the open operation. The amount of space remaining to be reclaimed by the background process is available through the .Sy freeing property. .Pp This feature is only .Sy active while .Sy freeing is non-zero. . .feature org.openzfs blake3 no extensible_dataset This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. BLAKE3 is a secure hash algorithm focused on high performance. .Pp .checksum-spiel blake3 . .feature com.fudosecurity block_cloning yes When this feature is enabled ZFS will use block cloning for operations like .Fn copy_file_range 2 . Block cloning allows to create multiple references to a single block. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. Blocks can be cloned across datasets under some conditions (like disabled encryption and equal .Nm recordsize ) . .Pp This feature becomes .Sy active when first block is cloned. When the last cloned block is freed, it goes back to the enabled state. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark command. .Pp This feature is .Sy active while any bookmarks exist in the pool. All bookmarks in the pool can be listed by running .Nm zfs Cm list Fl t Sy bookmark Fl r Ar poolname . . .feature com.datto bookmark_v2 no bookmark extensible_dataset This feature enables the creation and management of larger bookmarks which are needed for other features in ZFS. .Pp This feature becomes .Sy active when a v2 bookmark is created and will be returned to the .Sy enabled state when all v2 bookmarks are destroyed. . .feature com.delphix bookmark_written no bookmark extensible_dataset bookmark_v2 This feature enables additional bookmark accounting fields, enabling the .Sy written Ns # Ns Ar bookmark property .Pq space written since a bookmark and estimates of send stream sizes for incrementals from bookmarks. .Pp This feature becomes .Sy active when a bookmark is created and will be returned to the .Sy enabled state when all bookmarks with these fields are destroyed. . .feature org.openzfs device_rebuild yes This feature enables the ability for the .Nm zpool Cm attach and .Nm zpool Cm replace commands to perform sequential reconstruction .Pq instead of healing reconstruction when resilvering. .Pp Sequential reconstruction resilvers a device in LBA order without immediately verifying the checksums. Once complete, a scrub is started, which then verifies the checksums. This approach allows full redundancy to be restored to the pool in the minimum amount of time. This two-phase approach will take longer than a healing resilver when the time to verify the checksums is included. However, unless there is additional pool damage, no checksum errors should be reported by the scrub. This feature is incompatible with raidz configurations. . This feature becomes .Sy active while a sequential resilver is in progress, and returns to .Sy enabled when the resilver completes. . .feature com.delphix device_removal no This feature enables the .Nm zpool Cm remove command to remove top-level vdevs, evacuating them to reduce the total size of the pool. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.openzfs draid no This feature enables use of the .Sy draid vdev type. dRAID is a variant of RAID-Z which provides integrated distributed hot spares that allow faster resilvering while retaining the benefits of RAID-Z. Data, parity, and spare space are organized in redundancy groups and distributed evenly over all of the devices. .Pp This feature becomes .Sy active when creating a pool which uses the .Sy draid vdev type, or when adding a new .Sy draid vdev to an existing pool. . .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite .Po if compression is also enabled, an overwrite of a block whose checksum matches the data being written will be ignored .Pc . In an abundance of caution, Edon-R requires verification when used with dedup: .Nm zfs Cm set Sy dedup Ns = Ns Sy edonr , Ns Sy verify .Po see Xr zfs-set 8 Pc . .Pp Edon-R is a very high-performance hash algorithm that was part of the NIST SHA-3 competition. It provides extremely high hash performance .Pq over 350% faster than SHA-256 , but was not selected because of its unsuitability as a general purpose secure hash algorithm. This implementation utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of highly-compressible blocks. Blocks whose contents can compress to 112 bytes or smaller can take advantage of this feature. .Pp When this feature is enabled, the contents of highly-compressible blocks are stored in the block .Dq pointer itself .Po a misnomer in this case, as it contains the compressed data, rather than a pointer to its location on disk .Pc . Thus the space of the block .Pq one sector, typically 512 B or 4 KiB is saved, and no additional I/O is needed to read and write the data block. . \*[instant-never] . .feature com.delphix empty_bpobj yes This feature increases the performance of creating and using a large number of snapshots of a single filesystem or volume, and also reduces the disk space required. .Pp When there are many snapshots, each snapshot uses many Block Pointer Objects .Pq bpobjs to track blocks associated with that snapshot. However, in common use cases, most of these bpobjs are empty. This feature allows us to create each bpobj on-demand, thus eliminating the empty bpobjs. .Pp This feature is .Sy active while there are any filesystems, volumes, or snapshots which were created after enabling this feature. . .feature com.delphix enabled_txg yes Once this feature is enabled, ZFS records the transaction group number in which new features are enabled. This has no user-visible impact, but other features may depend on this feature. .Pp This feature becomes .Sy active as soon as it is enabled and will never return to being .Sy enabled . . .feature com.datto encryption no bookmark_v2 extensible_dataset This feature enables the creation and management of natively encrypted datasets. .Pp This feature becomes .Sy active when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. .Pp This feature will be .Sy active when the first dependent feature uses it, and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.joyent filesystem_limits yes extensible_dataset This feature enables filesystem and snapshot limits. These limits can be used to control how many filesystems and/or snapshots can be created at the point in the tree on which the limits are set. .Pp This feature is .Sy active once either of the limit properties has been set on a dataset and will never return to being .Sy enabled . . .feature com.delphix head_errlog no This feature enables the upgraded version of errlog, which required an on-disk error log format change. Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . In case of encrypted filesystems with unloaded keys we are unable to check their snapshots or clones for errors and these will not be reported. An "access denied" error will be reported. .Pp \*[instant-never] . .feature com.delphix hole_birth no enabled_txg This feature has/had bugs, the result of which is that, if you do a .Nm zfs Cm send Fl i .Pq or Fl R , No since it uses Fl i from an affected dataset, the receiving party will not see any checksum or other errors, but the resulting destination snapshot will not match the source. Its use by .Nm zfs Cm send Fl i has been disabled by default .Po see .Sy send_holes_without_birth_time in .Xr zfs 4 .Pc . .Pp This feature improves performance of incremental sends .Pq Nm zfs Cm send Fl i and receives for objects with many holes. The most common case of hole-filled objects is zvols. .Pp An incremental send stream from snapshot .Sy A No to snapshot Sy B contains information about every block that changed between .Sy A No and Sy B . Blocks which did not change between those snapshots can be identified and omitted from the stream using a piece of metadata called the .Dq block birth time , but birth times are not recorded for holes .Pq blocks filled only with zeroes . Since holes created after .Sy A No cannot be distinguished from holes created before Sy A , information about every hole in the entire filesystem or zvol is included in the send stream. .Pp For workloads where holes are rare this is not a problem. However, when incrementally replicating filesystems or zvols with many holes .Pq for example a zvol formatted with another filesystem a lot of time will be spent sending and receiving unnecessary information about holes that already exist on the receiving side. .Pp Once the .Sy hole_birth feature has been enabled the block birth times of all new holes will be recorded. Incremental sends between snapshots created after this feature is enabled will use this new metadata to avoid sending information about holes that already exist on the receiving side. .Pp \*[instant-never] . .feature org.open-zfs large_blocks no extensible_dataset This feature allows the record size on a dataset to be set larger than 128 KiB. .Pp This feature becomes .Sy active once a dataset contains a file with a block size larger than 128 KiB, and will return to being .Sy enabled once all filesystems that have ever had their recordsize larger than 128 KiB are destroyed. . .feature org.zfsonlinux large_dnode no extensible_dataset This feature allows the size of dnodes in a dataset to be set larger than 512 B. . This feature becomes .Sy active once a dataset contains an object with a dnode larger than 512 B, which occurs as a result of setting the .Sy dnodesize dataset property to a value other than .Sy legacy . The feature will return to being .Sy enabled once all filesystems that have ever contained a dnode larger than 512 B are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. . .feature com.delphix livelist yes This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. All blocks allocated and freed after a clone is created are tracked by the the clone's livelist which is referenced during the deletion of the clone. The feature is activated when a clone is created and remains .Sy active until all clones have been destroyed. . .feature com.delphix log_spacemap yes com.delphix:spacemap_v2 This feature improves performance for heavily-fragmented pools, especially when workloads are heavy in random-writes. It does so by logging all the metaslab changes on a single spacemap every TXG instead of scattering multiple writes to all the metaslab spacemaps. .Pp \*[instant-never] . .feature org.illumos lz4_compress no .Sy lz4 is a high-performance real-time compression algorithm that features significantly faster compression and decompression as well as a higher compression ratio than the older .Sy lzjb compression. Typically, .Sy lz4 compression is approximately 50% faster on compressible data and 200% faster on incompressible data than .Sy lzjb . It is also approximately 80% faster on decompression, while giving approximately a 10% better compression ratio. .Pp When the .Sy lz4_compress feature is set to .Sy enabled , the administrator can turn on .Sy lz4 compression on any dataset on the pool using the .Xr zfs-set 8 command. All newly written metadata will be compressed with the .Sy lz4 algorithm. .Pp \*[instant-never] . .feature com.joyent multi_vdev_crash_dump no This feature allows a dump device to be configured with a pool comprised of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz configuration. .Pp When the .Sy multi_vdev_crash_dump feature is set to .Sy enabled , the administrator can use .Xr dumpadm 8 to configure a dump device on a pool comprised of multiple vdevs. .Pp Under .Fx and Linux this feature is unused, but registered for compatibility. New pools created on these systems will have the feature .Sy enabled but will never transition to .Sy active , as this functionality is not required for crash dump support. Existing pools where this feature is .Sy active can be imported. . .feature com.delphix obsolete_counts yes device_removal This feature is an enhancement of .Sy device_removal , which will over time reduce the memory used to track removed devices. When indirect blocks are freed or remapped, we note that their part of the indirect mapping is .Dq obsolete – no longer needed. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account the spaces and objects usage information against the project identifier .Pq ID . .Pp The project ID is an object-based attribute. When upgrading an existing filesystem, objects without a project ID will be assigned a zero project ID. When this feature is enabled, newly created objects inherit their parent directories' project ID if the parent's inherit flag is set .Pq via Nm chattr Sy [+-]P No or Nm zfs Cm project Fl s Ns | Ns Fl C . Otherwise, the new object's project ID will be zero. An object's project ID can be changed at any time by the owner .Pq or privileged user via .Nm chattr Fl p Ar prjid or .Nm zfs Cm project Fl p Ar prjid . .Pp This feature will become .Sy active as soon as it is enabled and will never return to being .Sy disabled . \*[remount-upgrade] . .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , which create redaction bookmarks storing the list of blocks redacted by the send that created them. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.delphix redacted_datasets no extensible_dataset This feature enables the receiving of redacted .Nm zfs Cm send streams, which create redacted datasets when received. These datasets are missing some of their blocks, and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receives, see .Xr zfs-send 8 . . .feature com.datto resilver_defer yes This feature allows ZFS to postpone new resilvers if an existing one is already in progress. Without this feature, any new resilvers will cause the currently running one to be immediately restarted from the beginning. .Pp This feature becomes .Sy active once a resilver has been deferred, and returns to being .Sy enabled when the deferred resilver begins. . .feature org.illumos sha512 no extensible_dataset This feature enables the use of the SHA-512/256 truncated hash algorithm .Pq FIPS 180-4 for checksum and dedup. The native 64-bit arithmetic of SHA-512 provides an approximate 50% performance boost over SHA-256 on 64-bit hardware and is thus a good minimum-change replacement candidate for systems where hash performance is important, but these systems cannot for whatever reason utilize the faster .Sy skein No and Sy edonr algorithms. .Pp .checksum-spiel sha512 . .feature org.illumos skein no extensible_dataset This feature enables the use of the Skein hash algorithm for checksum and dedup. Skein is a high-performance secure hash algorithm that was a finalist in the NIST SHA-3 competition. It provides a very high security margin and high performance on 64-bit hardware .Pq 80% faster than SHA-256 . This implementation also utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel skein . .feature com.delphix spacemap_histogram yes This features allows ZFS to maintain more information about how free space is organized within the pool. If this feature is .Sy enabled , it will be activated when a new space map object is created, or an existing space map is upgraded to the new format, and never returns back to being .Sy enabled . . .feature com.delphix spacemap_v2 yes This feature enables the use of the new space map encoding which consists of two words .Pq instead of one whenever it is advantageous. The new encoding allows space maps to represent large regions of space more efficiently on-disk while also increasing their maximum addressable offset. .Pp This feature becomes .Sy active once it is .Sy enabled , and never returns back to being .Sy enabled . . .feature org.zfsonlinux userobj_accounting yes extensible_dataset This feature allows administrators to account the object usage information by user and group. .Pp \*[instant-never] \*[remount-upgrade] . .feature com.klarasystems vdev_zaps_v2 no This feature creates a ZAP object for the root vdev. .Pp This feature becomes active after the next .Nm zpool Cm import or .Nm zpool reguid . . Properties can be retrieved or set on the root vdev using .Nm zpool Cm get and .Nm zpool Cm set with .Sy root as the vdev name which is an alias for .Sy root-0 . .feature org.openzfs zilsaxattr yes extensible_dataset This feature enables .Sy xattr Ns = Ns Sy sa extended attribute logging in the ZIL. If enabled, extended attribute changes .Pq both Sy xattrdir Ns = Ns Sy dir No and Sy xattr Ns = Ns Sy sa are guaranteed to be durable if either the dataset had .Sy sync Ns = Ns Sy always set at the time the changes were made, or .Xr sync 2 is called on the dataset after the changes were made. .Pp This feature becomes .Sy active when a ZIL is created for at least one dataset and will be returned to the .Sy enabled state when it is destroyed for all datasets that use this feature. . .feature com.delphix zpool_checkpoint yes This feature enables the .Nm zpool Cm checkpoint command that can checkpoint the state of the pool at the time it was issued and later rewind back to it or discard it. .Pp This feature becomes .Sy active when the .Nm zpool Cm checkpoint command is used to checkpoint the pool. The feature will only return back to being .Sy enabled when the pool is rewound or the checkpoint has been discarded. . .feature org.freebsd zstd_compress no extensible_dataset .Sy zstd is a high-performance compression algorithm that features a combination of high compression ratios and high speed. Compared to .Sy gzip , .Sy zstd offers slightly better compression at much higher speeds. Compared to .Sy lz4 , .Sy zstd offers much better compression while being only modestly slower. Typically, .Sy zstd compression speed ranges from 250 to 500 MB/s per thread and decompression speed is over 1 GB/s per thread. .Pp When the .Sy zstd feature is set to .Sy enabled , the administrator can turn on .Sy zstd compression of any dataset using .Nm zfs Cm set Sy compress Ns = Ns Sy zstd Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy compress property has been set to .Sy zstd , and will return to being .Sy enabled once all filesystems that have ever had their .Sy compress property set to .Sy zstd are destroyed. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/sys/contrib/openzfs/man/man8/.gitignore b/sys/contrib/openzfs/man/man8/.gitignore index f2fc702147e9..a468f9cbf9d3 100644 --- a/sys/contrib/openzfs/man/man8/.gitignore +++ b/sys/contrib/openzfs/man/man8/.gitignore @@ -1,2 +1,3 @@ /zed.8 /zfs-mount-generator.8 +/zfs_prepare_disk.8 diff --git a/sys/contrib/openzfs/man/man8/zfs_prepare_disk.8.in b/sys/contrib/openzfs/man/man8/zfs_prepare_disk.8.in new file mode 100644 index 000000000000..2a741531e415 --- /dev/null +++ b/sys/contrib/openzfs/man/man8/zfs_prepare_disk.8.in @@ -0,0 +1,70 @@ +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). +.\" Copyright (C) 2023 Lawrence Livermore National Security, LLC. +.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License Version 1.0 (CDDL-1.0). +.\" You can obtain a copy of the license from the top-level file +.\" "OPENSOLARIS.LICENSE" or at . +.\" You may not use this file except in compliance with the license. +.\" +.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +.\" +.Dd August 30, 2023 +.Dt ZFS_PREPARE_DISK 8 +.Os +. +.Sh NAME +.Nm zfs_prepare_disk +.Nd special script that gets run before bringing a disk into a pool +.Sh DESCRIPTION +.Nm +is an optional script that gets called by libzfs before bringing a disk into a +pool. +It can be modified by the user to run whatever commands are necessary to prepare +a disk for inclusion into the pool. +For example, users can add lines to +.Nm zfs_prepare_disk +to do things like update the drive's firmware or check the drive's health. +.Nm zfs_prepare_disk +is optional and can be removed if not needed. +libzfs will look for the script at @zfsexecdir@/zfs_prepare_disk. +. +.Ss Properties +.Nm zfs_prepare_disk +will be passed the following environment variables: +.sp +.Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" +. +.It Nm POOL_NAME +.No Name of the pool +.It Nm VDEV_PATH +.No Path to the disk (like /dev/sda) +.It Nm VDEV_PREPARE +.No Reason why the disk is being prepared for inclusion +('create', 'add', 'replace', or 'autoreplace'). +This can be useful if you only want the script to be run under certain actions. +.It Nm VDEV_UPATH +.No Path to one of the underlying devices for the +disk. +For multipath this would return one of the /dev/sd* paths to the disk. +If the device is not a device mapper device, then +.Nm VDEV_UPATH +just returns the same value as +.Nm VDEV_PATH +.It Nm VDEV_ENC_SYSFS_PATH +.No Path to the disk's enclosure sysfs path, if available +.El +.Pp +Note that some of these variables may have a blank value. +.Nm POOL_NAME +is blank at pool creation time, for example. +.Sh ENVIRONMENT +.Nm zfs_prepare_disk +runs with a limited $PATH. +.Sh EXIT STATUS +.Nm zfs_prepare_disk +should return 0 on success, non-zero otherwise. +If non-zero is returned, the disk will not be included in the pool. +. diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index c132171592a8..b9c284a24418 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -1,498 +1,502 @@ # When integrated in to a monolithic kernel the spl module must appear # first. This ensures its module initialization function is run before # any of the other module initialization functions which depend on it. ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include icp_include = @abs_srcdir@/icp/include zstd_include = @abs_srcdir@/zstd/include ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs icp_include = $(srctree)/$(src)/icp/include zstd_include = $(srctree)/$(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build. ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif # Generated binary search code is particularly bad with this optimization. # Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c # is not affected when unrolling is done. # Disable it until the following upstream issue is resolved: # https://github.com/llvm/llvm-project/issues/62790 ifeq ($(CONFIG_X86),y) ifeq ($(CONFIG_CC_IS_CLANG),y) CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false endif endif ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ endif asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ifeq ($(CONFIG_ARM64),y) CFLAGS_REMOVE_zcommon/zfs_fletcher_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only endif # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value obj-$(CONFIG_ZFS) := spl.o zfs.o SPL_OBJS := \ spl-atomic.o \ spl-condvar.o \ spl-cred.o \ spl-err.o \ spl-generic.o \ spl-kmem-cache.o \ spl-kmem.o \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ spl-tsd.o \ spl-vmem.o \ spl-xdr.o \ spl-zlib.o \ spl-zone.o spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS)) zfs-objs += avl/avl.o ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ algs/modes/ctr.o \ algs/modes/ecb.o \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ algs/sha2/sha2_generic.o \ algs/sha2/sha256_impl.o \ algs/sha2/sha512_impl.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ api/kcf_cipher.o \ api/kcf_ctxops.o \ api/kcf_mac.o \ core/kcf_callprov.o \ core/kcf_mech_tabs.o \ core/kcf_prov_lib.o \ core/kcf_prov_tabs.o \ core/kcf_sched.o \ illumos-crypto.o \ io/aes.o \ io/sha2_mod.o \ io/skein_mod.o \ spi/kcf_spi.o ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ asm-x86_64/blake3/blake3_avx2.o \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/sha2/sha256-x86_64.o \ asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o ICP_OBJS_ARM := \ asm-arm/sha2/sha256-armv7.o \ asm-arm/sha2/sha512-armv7.o ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ asm-aarch64/blake3/b3_aarch64_sse41.o \ asm-aarch64/sha2/sha256-armv8.o \ asm-aarch64/sha2/sha512-armv8.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ asm-ppc64/blake3/b3_ppc64le_sse41.o \ asm-ppc64/sha2/sha256-p8.o \ asm-ppc64/sha2/sha512-p8.o \ asm-ppc64/sha2/sha256-ppc.o \ asm-ppc64/sha2/sha512-ppc.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) zfs-$(CONFIG_ARM) += $(addprefix icp/,$(ICP_OBJS_ARM)) zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) # Suppress objtool "return with modified stack frame" warnings. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y # Suppress objtool "unsupported stack pointer realignment" warnings. # See #6950 for the reasoning. OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y LUA_OBJS := \ lapi.o \ lauxlib.o \ lbaselib.o \ lcode.o \ lcompat.o \ lcorolib.o \ lctype.o \ ldebug.o \ ldo.o \ lfunc.o \ lgc.o \ llex.o \ lmem.o \ lobject.o \ lopcodes.o \ lparser.o \ lstate.o \ lstring.o \ lstrlib.o \ ltable.o \ ltablib.o \ ltm.o \ lvm.o \ lzio.o \ setjmp/setjmp.o zfs-objs += $(addprefix lua/,$(LUA_OBJS)) NVPAIR_OBJS := \ fnvpair.o \ nvpair.o \ nvpair_alloc_fixed.o \ nvpair_alloc_spl.o zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS)) UNICODE_OBJS := \ u8_textprep.o \ uconv.o zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) ZCOMMON_OBJS := \ cityhash.o \ zfeature_common.o \ zfs_comutil.o \ zfs_deleg.o \ zfs_fletcher.o \ zfs_fletcher_superscalar.o \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ zpool_prop.o \ zprop_common.o ZCOMMON_OBJS_X86 := \ zfs_fletcher_avx512.o \ zfs_fletcher_intel.o \ zfs_fletcher_sse.o ZCOMMON_OBJS_ARM64 := \ zfs_fletcher_aarch64_neon.o zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS)) zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64)) # Zstd uses -O3 by default, so we should follow ZFS_ZSTD_FLAGS := -O3 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h # Set it for other compilers, too. ZFS_ZSTD_FLAGS += -fno-tree-vectorize # SSE register return with SSE disabled if -march=znverX is passed ZFS_ZSTD_FLAGS += -U__BMI__ # Quiet warnings about frame size due to unused code in unmodified zstd lib ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480 ZSTD_OBJS := \ zfs_zstd.o \ zstd_sparc.o ZSTD_UPSTREAM_OBJS := \ lib/common/entropy_common.o \ lib/common/error_private.o \ lib/common/fse_decompress.o \ lib/common/pool.o \ lib/common/zstd_common.o \ lib/compress/fse_compress.o \ lib/compress/hist.o \ lib/compress/huf_compress.o \ lib/compress/zstd_compress.o \ lib/compress/zstd_compress_literals.o \ lib/compress/zstd_compress_sequences.o \ lib/compress/zstd_compress_superblock.o \ lib/compress/zstd_double_fast.o \ lib/compress/zstd_fast.o \ lib/compress/zstd_lazy.o \ lib/compress/zstd_ldm.o \ lib/compress/zstd_opt.o \ lib/decompress/huf_decompress.o \ lib/decompress/zstd_ddict.o \ lib/decompress/zstd_decompress.o \ lib/decompress/zstd_decompress_block.o zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) # Disable aarch64 neon SIMD instructions for kernel mode $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) $(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ bptree.o \ bqueue.o \ brt.o \ btree.o \ dataset_kstats.o \ dbuf.o \ dbuf_stats.o \ ddt.o \ ddt_zap.o \ dmu.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ dmu_recv.o \ dmu_redact.o \ dmu_send.o \ dmu_traverse.o \ dmu_tx.o \ dmu_zfetch.o \ dnode.o \ dnode_sync.o \ dsl_bookmark.o \ dsl_crypt.o \ dsl_dataset.o \ dsl_deadlist.o \ dsl_deleg.o \ dsl_destroy.o \ dsl_dir.o \ dsl_pool.o \ dsl_prop.o \ dsl_scan.o \ dsl_synctask.o \ dsl_userhold.o \ edonr_zfs.o \ fm.o \ gzip.o \ hkdf.o \ lz4.o \ lz4_zfs.o \ lzjb.o \ metaslab.o \ mmp.o \ multilist.o \ objlist.o \ pathname.o \ range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ sha2_zfs.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ spa_history.o \ spa_log_spacemap.o \ spa_misc.o \ spa_stats.o \ space_map.o \ space_reftree.o \ txg.o \ uberblock.o \ unique.o \ vdev.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ vdev_raidz_math_scalar.o \ vdev_rebuild.o \ vdev_removal.o \ vdev_root.o \ vdev_trim.o \ zap.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ zcp_get.o \ zcp_global.o \ zcp_iter.o \ zcp_set.o \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ zfs_impl.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ zfs_quota.o \ zfs_ratelimit.o \ zfs_replay.o \ zfs_rlock.o \ zfs_sa.o \ zfs_vnops.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ zle.o \ zrlock.o \ zthr.o \ zvol.o ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ policy.o \ qat.o \ qat_compress.o \ qat_crypt.o \ spa_misc_os.o \ trace.o \ vdev_disk.o \ vdev_file.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ zfs_dir.o \ zfs_file_os.o \ zfs_ioctl_os.o \ zfs_racct.o \ zfs_sysfs.o \ zfs_uio.o \ zfs_vfsops.o \ zfs_vnops_os.o \ zfs_znode.o \ zio_crypt.o \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ zpl_file_range.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ zvol_os.o ZFS_OBJS_X86 := \ vdev_raidz_math_avx2.o \ vdev_raidz_math_avx512bw.o \ vdev_raidz_math_avx512f.o \ vdev_raidz_math_sse2.o \ vdev_raidz_math_ssse3.o ZFS_OBJS_ARM64 := \ vdev_raidz_math_aarch64_neon.o \ vdev_raidz_math_aarch64_neonx2.o ZFS_OBJS_PPC_PPC64 := \ vdev_raidz_math_powerpc_altivec.o zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS)) zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) +UBSAN_SANITIZE_zap_leaf.o := n +UBSAN_SANITIZE_zap_micro.o := n +UBSAN_SANITIZE_sa.o := n + # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c index b31810d57f59..3fba5ed3c228 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c @@ -1,449 +1,447 @@ /* * Copyright (c) 2009 Pawel Jakub Dawidek * All rights reserved. * * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include -#include -#include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) #include #endif #include #if __FreeBSD_version < 1201522 #define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \ taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__) #endif static uint_t taskq_tsd; static uma_zone_t taskq_zone; /* * Global system-wide dynamic task queue available for all consumers. This * taskq is not intended for long-running tasks; instead, a dedicated taskq * should be created. */ taskq_t *system_taskq = NULL; taskq_t *system_delay_taskq = NULL; taskq_t *dynamic_taskq = NULL; proc_t *system_proc; -extern int uma_align_cache; - static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); -static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; static unsigned long tqenthash; static unsigned long tqenthashlock; static struct sx *tqenthashtbl_lock; static taskqid_t tqidnext; #define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) #define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) +#define NORMAL_TASK 0 #define TIMEOUT_TASK 1 -#define NORMAL_TASK 2 static void system_taskq_init(void *arg) { int i; tsd_create(&taskq_tsd, NULL); tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash); tqenthashlock = (tqenthash + 1) / 8; if (tqenthashlock > 0) tqenthashlock--; tqenthashtbl_lock = malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1), M_TASKQ, M_WAITOK | M_ZERO); for (i = 0; i < tqenthashlock + 1; i++) sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK); taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, 0, 0, 0); system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, minclsyspri, 0, 0, 0); } SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, NULL); static void system_taskq_fini(void *arg) { int i; taskq_destroy(system_delay_taskq); taskq_destroy(system_taskq); uma_zdestroy(taskq_zone); tsd_destroy(&taskq_tsd); for (i = 0; i < tqenthashlock + 1; i++) sx_destroy(&tqenthashtbl_lock[i]); for (i = 0; i < tqenthash + 1; i++) - VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + VERIFY(LIST_EMPTY(&tqenthashtbl[i])); free(tqenthashtbl_lock, M_TASKQ); free(tqenthashtbl, M_TASKQ); } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); #ifdef __LP64__ static taskqid_t __taskq_genid(void) { taskqid_t tqid; /* * Assume a 64-bit counter will not wrap in practice. */ tqid = atomic_add_64_nv(&tqidnext, 1); VERIFY(tqid); return (tqid); } #else static taskqid_t __taskq_genid(void) { taskqid_t tqid; for (;;) { tqid = atomic_add_32_nv(&tqidnext, 1); if (__predict_true(tqid != 0)) break; } VERIFY(tqid); return (tqid); } #endif static taskq_ent_t * taskq_lookup(taskqid_t tqid) { taskq_ent_t *ent = NULL; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (tqid == 0) + return (NULL); + sx_slock(TQIDHASHLOCK(tqid)); + LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { if (ent->tqent_id == tqid) break; } if (ent != NULL) refcount_acquire(&ent->tqent_rc); - sx_xunlock(TQIDHASHLOCK(tqid)); + sx_sunlock(TQIDHASHLOCK(tqid)); return (ent); } static taskqid_t taskq_insert(taskq_ent_t *ent) { - taskqid_t tqid; + taskqid_t tqid = __taskq_genid(); - tqid = __taskq_genid(); ent->tqent_id = tqid; - ent->tqent_registered = B_TRUE; sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); + LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); sx_xunlock(TQIDHASHLOCK(tqid)); return (tqid); } static void taskq_remove(taskq_ent_t *ent) { taskqid_t tqid = ent->tqent_id; - if (!ent->tqent_registered) + if (tqid == 0) return; - sx_xlock(TQIDHASHLOCK(tqid)); - CK_LIST_REMOVE(ent, tqent_hash); + if (ent->tqent_id != 0) { + LIST_REMOVE(ent, tqent_hash); + ent->tqent_id = 0; + } sx_xunlock(TQIDHASHLOCK(tqid)); - ent->tqent_registered = B_FALSE; } static void taskq_tsd_set(void *context) { taskq_t *tq = context; #if defined(__amd64__) || defined(__aarch64__) if (context != NULL && tsd_get(taskq_tsd) == NULL) fpu_kern_thread(FPU_KERN_NORMAL); #endif tsd_set(taskq_tsd, tq); } static taskq_t * taskq_create_impl(const char *name, int nthreads, pri_t pri, proc_t *proc __maybe_unused, uint_t flags) { taskq_t *tq; if ((flags & TASKQ_THREADS_CPU_PCT) != 0) nthreads = MAX((mp_ncpus * nthreads) / 100, 1); tq = kmem_alloc(sizeof (*tq), KM_SLEEP); tq->tq_queue = taskqueue_create(name, M_WAITOK, taskqueue_thread_enqueue, &tq->tq_queue); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, taskq_tsd_set, tq); taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, taskq_tsd_set, NULL); (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri, proc, "%s", name); return ((taskq_t *)tq); } taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, int maxalloc __unused, uint_t flags) { return (taskq_create_impl(name, nthreads, pri, system_proc, flags)); } taskq_t * taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags) { return (taskq_create_impl(name, nthreads, pri, proc, flags)); } void taskq_destroy(taskq_t *tq) { taskqueue_free(tq->tq_queue); kmem_free(tq, sizeof (*tq)); } int taskq_member(taskq_t *tq, kthread_t *thread) { return (taskqueue_member(tq->tq_queue, thread)); } taskq_t * taskq_of_curthread(void) { return (tsd_get(taskq_tsd)); } static void taskq_free(taskq_ent_t *task) { taskq_remove(task); if (refcount_release(&task->tqent_rc)) uma_zfree(taskq_zone, task); } int taskq_cancel_id(taskq_t *tq, taskqid_t tid) { uint32_t pend; int rc; taskq_ent_t *ent; - if (tid == 0) - return (0); - if ((ent = taskq_lookup(tid)) == NULL) return (0); - ent->tqent_cancelled = B_TRUE; - if (ent->tqent_type == TIMEOUT_TASK) { + if (ent->tqent_type == NORMAL_TASK) { + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else { rc = taskqueue_cancel_timeout(tq->tq_queue, &ent->tqent_timeout_task, &pend); - } else - rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); - if (rc == EBUSY) { - taskqueue_drain(tq->tq_queue, &ent->tqent_task); - } else if (pend) { + if (rc == EBUSY) { + taskqueue_drain_timeout(tq->tq_queue, + &ent->tqent_timeout_task); + } + } + if (pend) { /* * Tasks normally free themselves when run, but here the task * was cancelled so it did not free itself. */ taskq_free(ent); } /* Free the extra reference we added with taskq_lookup. */ taskq_free(ent); return (rc); } static void -taskq_run(void *arg, int pending __unused) +taskq_run(void *arg, int pending) { taskq_ent_t *task = arg; - if (!task->tqent_cancelled) - task->tqent_func(task->tqent_arg); + if (pending == 0) + return; + task->tqent_func(task->tqent_arg); taskq_free(task); } taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t flags, clock_t expire_time) { taskq_ent_t *task; taskqid_t tqid; clock_t timo; int mflag; timo = expire_time - ddi_get_lbolt(); if (timo <= 0) return (taskq_dispatch(tq, func, arg, flags)); if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) mflag = M_WAITOK; else mflag = M_NOWAIT; task = uma_zalloc(taskq_zone, mflag); if (task == NULL) return (0); task->tqent_func = func; task->tqent_arg = arg; task->tqent_type = TIMEOUT_TASK; - task->tqent_cancelled = B_FALSE; refcount_init(&task->tqent_rc, 1); tqid = taskq_insert(task); TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, taskq_run, task); taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, timo); return (tqid); } taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { taskq_ent_t *task; int mflag, prio; taskqid_t tqid; if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) mflag = M_WAITOK; else mflag = M_NOWAIT; /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of the queue. */ prio = !!(flags & TQ_FRONT); task = uma_zalloc(taskq_zone, mflag); if (task == NULL) return (0); refcount_init(&task->tqent_rc, 1); task->tqent_func = func; task->tqent_arg = arg; - task->tqent_cancelled = B_FALSE; task->tqent_type = NORMAL_TASK; tqid = taskq_insert(task); TASK_INIT(&task->tqent_task, prio, taskq_run, task); taskqueue_enqueue(tq->tq_queue, &task->tqent_task); return (tqid); } static void -taskq_run_ent(void *arg, int pending __unused) +taskq_run_ent(void *arg, int pending) { taskq_ent_t *task = arg; + if (pending == 0) + return; task->tqent_func(task->tqent_arg); } void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, taskq_ent_t *task) { int prio; /* * If TQ_FRONT is given, we want higher priority for this task, so it * can go at the front of the queue. */ prio = !!(flags & TQ_FRONT); - task->tqent_cancelled = B_FALSE; - task->tqent_registered = B_FALSE; task->tqent_id = 0; task->tqent_func = func; task->tqent_arg = arg; TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); taskqueue_enqueue(tq->tq_queue, &task->tqent_task); } void taskq_wait(taskq_t *tq) { taskqueue_quiesce(tq->tq_queue); } void taskq_wait_id(taskq_t *tq, taskqid_t tid) { taskq_ent_t *ent; - if (tid == 0) - return; if ((ent = taskq_lookup(tid)) == NULL) return; - taskqueue_drain(tq->tq_queue, &ent->tqent_task); + if (ent->tqent_type == NORMAL_TASK) + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + else + taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task); taskq_free(ent); } void taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) { taskqueue_drain_all(tq->tq_queue); } int taskq_empty_ent(taskq_ent_t *t) { return (t->tqent_task.ta_pending == 0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 12f16edb1e2b..92696c0bf1ae 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -1,258 +1,196 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#if __FreeBSD_version >= 1300139 -static struct sx arc_vnlru_lock; -static struct vnode *arc_vnlru_marker; -#endif - extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_cnt.v_free_target; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, free_target, param_set_arc_free_target, 0, CTLFLAG_RW, "Desired number of free pages below which ARC triggers reclaim"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, no_grow_shift, param_set_arc_no_grow_shift, 0, ZMOD_RW, "log2(fraction of ARC which must be free to allow growing)"); int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n __unused; /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; } #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { lowest = n; } #endif DTRACE_PROBE1(arc__available_memory, int64_t, lowest); return (lowest); } /* * Return a default max arc size based on the amount of physical memory. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { uint64_t size; if (allmem >= 1 << 30) size = allmem - (1 << 30); else size = min; return (MAX(allmem * 5 / 8, size)); } -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *arg) -{ - uint64_t nr_scan = (uintptr_t)arg; - -#ifndef __ILP32__ - if (nr_scan > INT_MAX) - nr_scan = INT_MAX; -#endif - -#if __FreeBSD_version >= 1300139 - sx_xlock(&arc_vnlru_lock); - vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); - sx_xunlock(&arc_vnlru_lock); -#else - vnlru_free(nr_scan, &zfs_vfsops); -#endif -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - -#ifndef __LP64__ - if (adjust > UINTPTR_MAX) - adjust = UINTPTR_MAX; -#endif - taskq_dispatch(arc_prune_taskq, arc_prune_task, - (void *)(intptr_t)adjust, TQ_SLEEP); - ARCSTAT_BUMP(arcstat_prune); -} - uint64_t arc_all_memory(void) { return (ptob(physmem)); } int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { return (0); } uint64_t arc_free_memory(void) { return (ptob(freemem)); } static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { int64_t free_memory, to_free; arc_no_grow = B_TRUE; arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); int64_t can_free = arc_c - arc_c_min; if (can_free <= 0) return; to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. */ if (curproc == pageproc) arc_wait_for_eviction(to_free, B_FALSE); } void arc_lowmem_init(void) { arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); -#if __FreeBSD_version >= 1300139 - arc_vnlru_marker = vnlru_alloc_marker(); - sx_init(&arc_vnlru_lock, "arc vnlru lock"); -#endif } void arc_lowmem_fini(void) { if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); -#if __FreeBSD_version >= 1300139 - if (arc_vnlru_marker != NULL) { - vnlru_free_marker(arc_vnlru_marker); - sx_destroy(&arc_vnlru_lock); - } -#endif } void arc_register_hotplug(void) { } void arc_unregister_hotplug(void) { } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 8969fd6a54bd..23b8da184535 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -1,2519 +1,2551 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifndef MNTK_VMSETSIZE_BUG #define MNTK_VMSETSIZE_BUG 0 #endif #ifndef MNTK_NOMSYNC #define MNTK_NOMSYNC 8 #endif struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owners can perform privileged operation on file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled; +int zfs_bclone_enabled = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, &zfs_bclone_enabled, 0, "Enable block cloning"); struct zfs_jailparam { int mount_snapshot; }; static struct zfs_jailparam zfs_jailparam0 = { .mount_snapshot = 0, }; static int zfs_jailparam_slot; SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", "Allow mounting snapshots in the .zfs directory for unjailed datasets"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); #if __FreeBSD_version >= 1400018 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy); #else static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); #endif static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); #if __FreeBSD_version >= 1300098 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); #else static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); #endif static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, #if __FreeBSD_version >= 1300049 .vfs_root = vfs_cache_root, .vfs_cachedroot = zfs_root, #else .vfs_root = zfs_root, #endif .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; #ifdef VFCF_CROSS_COPY_FILE_RANGE VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE); #else VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL); #endif /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &zfvp); if (error != 0) return (error); if (zfvp == NULL) return (ENOENT); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) tmp = 1; break; case ZFS_PROP_DEVICES: if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) tmp = 1; break; case ZFS_PROP_EXEC: if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) tmp = 1; break; case ZFS_PROP_SETUID: if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) tmp = 1; break; case ZFS_PROP_READONLY: if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) tmp = 1; break; case ZFS_PROP_XATTR: if (zfvp->z_flags & ZSB_XATTR) tmp = zfvp->z_xattr; break; case ZFS_PROP_NBMAND: if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) tmp = 1; break; default: vfs_unbusy(vfsp); return (ENOENT); } vfs_unbusy(vfsp); if (tmp != *val) { if (setpoint) (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = ENOENT; goto done; } (void) sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. */ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int #if __FreeBSD_version >= 1400018 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) #else zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; #if __FreeBSD_version < 1400018 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); #endif goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_id_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, * sizeof (long long), 1, "a) * * See zfs_set_userquota() to set a quota. */ if ((uint32_t)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof (int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_QUOTAOFF: error = ENOTSUP; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof (dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof (dqblk)); break; default: error = EINVAL; break; } done: zfs_exit(zfsvfs, FTAG); return (error); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); } static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (rebooting && spa_suspended(dp->dp_spa)) { zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static void acl_type_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_type = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; boolean_t xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; boolean_t do_xattr = B_FALSE; int error = 0; ASSERT3P(vfsp, !=, NULL); zfsvfs = vfsp->vfs_data; ASSERT3P(zfsvfs, !=, NULL); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); if (error != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if (error == 0 && val == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT3U(zfsvfs->z_root, !=, 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); ZFS_TEARDOWN_INIT(zfsvfs); ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) return (SET_ERROR(EROFS)); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) { zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; } else { dsl_dir_t *dd; zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", (u_longlong_t)zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { boolean_t use_nc = zfsvfs->z_use_namecache; zfsvfs->z_use_namecache = B_FALSE; zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; zfsvfs->z_use_namecache = use_nc; } } /* restore readonly bit */ if (readonly != 0) zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } else { ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; ASSERT3P(vfsp, !=, NULL); ASSERT3P(osname, !=, NULL); error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * This can cause a loss of coherence between ARC and page cache * on ZoF - unclear if the problem is in FreeBSD or ZoF */ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_NOMSYNC; vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; #if defined(_KERNEL) && !defined(KMEM_DEBUG) vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; #endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | (vfsp->mnt_vfc->vfc_typenum & 0xFF); /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acl_type_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strlcpy(poolname, osname, p - osname + 1); } return (0); } static void fetch_osname_options(char *name, bool *checkpointrewind) { if (name[0] == '!') { *checkpointrewind = true; memmove(name, name + 1, strlen(name)); } else { *checkpointrewind = false; } } static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; bool checkpointrewind, isctlsnap = false; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } fetch_osname_options(osname, &checkpointrewind); isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) && strchr(osname, '@') != NULL); /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error && isctlsnap) { secpolicy_fs_mount_clearopts(cr, vfsp); } else if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK1(mvp); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK1(mvp); goto out; } VOP_UNLOCK1(mvp); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { boolean_t mount_snapshot = B_FALSE; /* * Snapshots may be mounted in .zfs for unjailed datasets * if allowed by the jail param zfs.mount_snapshot. */ if (isctlsnap) { struct prison *pr; struct zfs_jailparam *zjp; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); zjp = osd_jail_get(pr, zfs_jailparam_slot); mtx_unlock(&pr->pr_mtx); if (zjp && zjp->mount_snapshot) mount_snapshot = B_TRUE; } if (!mount_snapshot) { error = SET_ERROR(EPERM); goto out; } } vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname, checkpointrewind); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; int error; statp->f_version = STATFS_VERSION; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ strlcpy(statp->f_fstypename, "zfs", sizeof (statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof (statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof (statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); zfs_exit(zfsvfs, FTAG); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; dsl_dir_t *dd; /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but zreles run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely check z_all_znodes for being empty because the * VFS has already blocked operations which add to it. */ int round = 0; while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ #ifdef FREEBSD_NAMECACHE #if __FreeBSD_version >= 1300117 cache_purgevfs(zfsvfs->z_parent->z_vfs); #else cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl != NULL) { zfs_znode_dmu_fini(zp); } } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (!zfs_is_readonly(zfsvfs)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)vfsp->vfs_resource, ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int #if __FreeBSD_version >= 1300098 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors) #else zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, "struct fid bigger than SHORT_FID_LEN"); _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, "struct fid bigger than LONG_FID_LEN"); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t setgen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); zfs_exit(zfsvfs, FTAG); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (fidp->fid_len == LONG_FID_LEN && setgen != 0) { zfs_exit(zfsvfs, FTAG); dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", (u_longlong_t)fid_gen, (u_longlong_t)setgen); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { zfs_exit(zfsvfs, FTAG); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, (u_longlong_t)fid_gen, (u_longlong_t)gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { zfs_exit(zfsvfs, FTAG); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); vrele(ZTOV(zp)); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via zfs_enter_verify_zp * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #include #include #include #include #include #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof (struct vm_object) + sizeof (struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } +#if __FreeBSD_version >= 1300139 +static struct sx zfs_vnlru_lock; +static struct vnode *zfs_vnlru_marker; +#endif +static arc_prune_t *zfs_prune; + +static void +zfs_prune_task(uint64_t nr_to_scan, void *arg __unused) +{ + if (nr_to_scan > INT_MAX) + nr_to_scan = INT_MAX; +#if __FreeBSD_version >= 1300139 + sx_xlock(&zfs_vnlru_lock); + vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker); + sx_xunlock(&zfs_vnlru_lock); +#else + vnlru_free(nr_to_scan, &zfs_vfsops); +#endif +} + void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); + +#if __FreeBSD_version >= 1300139 + zfs_vnlru_marker = vnlru_alloc_marker(); + sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); +#endif + zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); } void zfs_fini(void) { + arc_remove_prune_callback(zfs_prune); +#if __FreeBSD_version >= 1300139 + vnlru_free_marker(zfs_vnlru_marker); + sx_destroy(&zfs_vnlru_lock); +#endif + taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_vfs, 0); zfsvfs->z_unmounted = B_TRUE; return (0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY0(sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %ju to %ju", (uintmax_t)zfsvfs->z_version, (uintmax_t)newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void) strlcpy(fromname, newname, sizeof (mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", newname, fromname + oldlen); (void) strlcpy(fromname, tmpbuf, sizeof (mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif /* * Find a prison with ZFS info. * Return the ZFS info and the (locked) prison. */ static struct zfs_jailparam * zfs_jailparam_find(struct prison *spr, struct prison **prp) { struct prison *pr; struct zfs_jailparam *zjp; for (pr = spr; ; pr = pr->pr_parent) { mtx_lock(&pr->pr_mtx); if (pr == &prison0) { zjp = &zfs_jailparam0; break; } zjp = osd_jail_get(pr, zfs_jailparam_slot); if (zjp != NULL) break; mtx_unlock(&pr->pr_mtx); } *prp = pr; return (zjp); } /* * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the * ZFS info and lock the prison. */ static void zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp) { struct prison *ppr; struct zfs_jailparam *zjp, *nzjp; void **rsv; /* If this prison already has ZFS info, return that. */ zjp = zfs_jailparam_find(pr, &ppr); if (ppr == pr) goto done; /* * Allocate a new info record. Then check again, in case something * changed during the allocation. */ mtx_unlock(&ppr->pr_mtx); nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK); rsv = osd_reserve(zfs_jailparam_slot); zjp = zfs_jailparam_find(pr, &ppr); if (ppr == pr) { free(nzjp, M_PRISON); osd_free_reserved(rsv); goto done; } /* Inherit the initial values from the ancestor. */ mtx_lock(&pr->pr_mtx); (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp); (void) memcpy(nzjp, zjp, sizeof (*zjp)); zjp = nzjp; mtx_unlock(&ppr->pr_mtx); done: if (zjpp != NULL) *zjpp = zjp; else mtx_unlock(&pr->pr_mtx); } /* * Jail OSD methods for ZFS VFS info. */ static int zfs_jailparam_create(void *obj, void *data) { struct prison *pr = obj; struct vfsoptlist *opts = data; int jsys; if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 && jsys == JAIL_SYS_INHERIT) return (0); /* * Inherit a prison's initial values from its parent * (different from JAIL_SYS_INHERIT which also inherits changes). */ zfs_jailparam_alloc(pr, NULL); return (0); } static int zfs_jailparam_get(void *obj, void *data) { struct prison *ppr, *pr = obj; struct vfsoptlist *opts = data; struct zfs_jailparam *zjp; int jsys, error; zjp = zfs_jailparam_find(pr, &ppr); jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys)); if (error != 0 && error != ENOENT) goto done; if (jsys == JAIL_SYS_NEW) { error = vfs_setopt(opts, "zfs.mount_snapshot", &zjp->mount_snapshot, sizeof (zjp->mount_snapshot)); if (error != 0 && error != ENOENT) goto done; } else { /* * If this prison is inheriting its ZFS info, report * empty/zero parameters. */ static int mount_snapshot = 0; error = vfs_setopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error != 0 && error != ENOENT) goto done; } error = 0; done: mtx_unlock(&ppr->pr_mtx); return (error); } static int zfs_jailparam_set(void *obj, void *data) { struct prison *pr = obj; struct prison *ppr; struct vfsoptlist *opts = data; int error, jsys, mount_snapshot; /* Set the parameters, which should be correct. */ error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); if (error == ENOENT) jsys = -1; error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error == ENOENT) mount_snapshot = -1; else jsys = JAIL_SYS_NEW; switch (jsys) { case JAIL_SYS_NEW: { /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */ struct zfs_jailparam *zjp; /* * A child jail cannot have more permissions than its parent */ if (pr->pr_parent != &prison0) { zjp = zfs_jailparam_find(pr->pr_parent, &ppr); mtx_unlock(&ppr->pr_mtx); if (zjp->mount_snapshot < mount_snapshot) { return (EPERM); } } zfs_jailparam_alloc(pr, &zjp); if (mount_snapshot != -1) zjp->mount_snapshot = mount_snapshot; mtx_unlock(&pr->pr_mtx); break; } case JAIL_SYS_INHERIT: /* "zfs=inherit": inherit the parent's ZFS info. */ mtx_lock(&pr->pr_mtx); osd_jail_del(pr, zfs_jailparam_slot); mtx_unlock(&pr->pr_mtx); break; case -1: /* * If the setting being changed is not ZFS related * then do nothing. */ break; } return (0); } static int zfs_jailparam_check(void *obj __unused, void *data) { struct vfsoptlist *opts = data; int error, jsys, mount_snapshot; /* Check that the parameters are correct. */ error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); if (error != ENOENT) { if (error != 0) return (error); if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) return (EINVAL); } error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error != ENOENT) { if (error != 0) return (error); if (mount_snapshot != 0 && mount_snapshot != 1) return (EINVAL); } return (0); } static void zfs_jailparam_destroy(void *data) { free(data, M_PRISON); } static void zfs_jailparam_sysinit(void *arg __unused) { struct prison *pr; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CREATE] = zfs_jailparam_create, [PR_METHOD_GET] = zfs_jailparam_get, [PR_METHOD_SET] = zfs_jailparam_set, [PR_METHOD_CHECK] = zfs_jailparam_check, }; zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods); /* Copy the defaults to any existing prisons. */ sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) zfs_jailparam_alloc(pr, NULL); sx_sunlock(&allprison_lock); } static void zfs_jailparam_sysuninit(void *arg __unused) { osd_jail_deregister(zfs_jailparam_slot); } SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zfs_jailparam_sysinit, NULL); SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zfs_jailparam_sysuninit, NULL); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 29a8802b8367..43ed087e2dbb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -1,544 +1,493 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #endif #include #include #include #include #include #include #include /* * This is a limit on how many pages the ARC shrinker makes available for * eviction in response to one page allocation attempt. Note that in * practice, the kernel's shrinker can ask us to evict up to about 4x this * for one allocation attempt. * * The default limit of 10,000 (in practice, 160MB per allocation attempt * with 4K pages) limits the amount of time spent attempting to reclaim ARC * memory to less than 100ms per allocation attempt, even with a small * average compressed block size of ~8KB. * * See also the comment in arc_shrinker_count(). * Set to 0 to disable limit. */ int zfs_arc_shrinker_limit = 10000; #ifdef CONFIG_MEMORY_HOTPLUG static struct notifier_block arc_hotplug_callback_mem_nb; #endif /* * Return a default max arc size based on the amount of physical memory. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { /* Default to 1/2 of all memory. */ return (MAX(allmem / 2, min)); } #ifdef _KERNEL /* * Return maximum amount of memory that we could possibly use. Reduced * to half of all memory in user space which is primarily used for testing. */ uint64_t arc_all_memory(void) { #ifdef CONFIG_HIGHMEM return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); #else return (ptob(zfs_totalram_pages)); #endif /* CONFIG_HIGHMEM */ } /* * Return the amount of memory that is considered free. In user space * which is primarily used for testing we pretend that free memory ranges * from 0-20% of all memory. */ uint64_t arc_free_memory(void) { #ifdef CONFIG_HIGHMEM struct sysinfo si; si_meminfo(&si); return (ptob(si.freeram - si.freehigh)); #else return (ptob(nr_free_pages() + nr_inactive_file_pages())); #endif /* CONFIG_HIGHMEM */ } /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ int64_t arc_available_memory(void) { return (arc_free_memory() - arc_sys_free); } static uint64_t arc_evictable_memory(void) { int64_t asize = aggsum_value(&arc_sums.arcstat_size); uint64_t arc_clean = zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); /* * Scale reported evictable memory in proportion to page cache, cap * at specified min/max. */ uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; min = MAX(arc_c_min, MIN(arc_c_max, min)); if (arc_dirty >= min) return (arc_clean); return (MAX((int64_t)asize - (int64_t)min, 0)); } /* * The _count() function returns the number of free-able objects. * The _scan() function returns the number of objects that were freed. */ static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { /* * __GFP_FS won't be set if we are called from ZFS code (see * kmem_flags_convert(), which removes it). To avoid a deadlock, we * don't allow evicting in this case. We return 0 rather than * SHRINK_STOP so that the shrinker logic doesn't accumulate a * deficit against us. */ if (!(sc->gfp_mask & __GFP_FS)) { return (0); } /* * This code is reached in the "direct reclaim" case, where the * kernel (outside ZFS) is trying to allocate a page, and the system * is low on memory. * * The kernel's shrinker code doesn't understand how many pages the * ARC's callback actually frees, so it may ask the ARC to shrink a * lot for one page allocation. This is problematic because it may * take a long time, thus delaying the page allocation, and because * it may force the ARC to unnecessarily shrink very small. * * Therefore, we limit the amount of data that we say is evictable, * which limits the amount that the shrinker will ask us to evict for * one page allocation attempt. * * In practice, we may be asked to shrink 4x the limit to satisfy one * page allocation, before the kernel's shrinker code gives up on us. * When that happens, we rely on the kernel code to find the pages * that we freed before invoking the OOM killer. This happens in * __alloc_pages_slowpath(), which retries and finds the pages we * freed when it calls get_page_from_freelist(). * * See also the comment above zfs_arc_shrinker_limit. */ int64_t limit = zfs_arc_shrinker_limit != 0 ? zfs_arc_shrinker_limit : INT64_MAX; return (MIN(limit, btop((int64_t)arc_evictable_memory()))); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { ASSERT((sc->gfp_mask & __GFP_FS) != 0); /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; /* * Evict the requested number of pages by reducing arc_c and waiting * for the requested amount of data to be evicted. */ arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) #ifdef HAVE_RECLAIM_STATE_RECLAIMED current->reclaim_state->reclaimed += sc->nr_to_scan; #else current->reclaim_state->reclaimed_slab += sc->nr_to_scan; #endif /* * We are experiencing memory pressure which the arc_evict_zthr was * unable to keep up with. Set arc_no_grow to briefly pause arc * growth to avoid compounding the memory pressure. */ arc_no_grow = B_TRUE; /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory * available. */ if (current_is_kswapd()) { ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { ARCSTAT_BUMP(arcstat_memory_direct_count); } return (sc->nr_to_scan); } SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { uint64_t free_memory = arc_free_memory(); if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100) return (0); if (txg > spa->spa_lowmem_last_txg) { spa->spa_lowmem_last_txg = txg; spa->spa_lowmem_page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (current_is_kswapd()) { if (spa->spa_lowmem_page_load > MAX(arc_sys_free / 4, free_memory) / 4) { DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(ERESTART)); } /* Note: reserve is inflated, so we deflate */ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); return (0); } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(EAGAIN)); } spa->spa_lowmem_page_load = 0; return (0); } static void arc_set_sys_free(uint64_t allmem) { /* * The ARC tries to keep at least this much memory available for the * system. This gives the ARC time to shrink in response to memory * pressure, before running completely out of memory and invoking the * direct-reclaim ARC shrinker. * * This should be more than twice high_wmark_pages(), so that * arc_wait_for_eviction() will wait until at least the * high_wmark_pages() are free (see arc_evict_state_impl()). * * Note: Even when the system is very low on memory, the kernel's * shrinker code may only ask for one "batch" of pages (512KB) to be * evicted. If concurrent allocations consume these pages, there may * still be insufficient free pages, and the OOM killer takes action. * * By setting arc_sys_free large enough, and having * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 * free memory, it is much less likely that concurrent allocations can * consume all the memory that was evicted before checking for * OOM. * * It's hard to iterate the zones from a linux kernel module, which * makes it difficult to determine the watermark dynamically. Instead * we compute the maximum high watermark for this system, based * on the amount of memory, assuming default parameters on Linux kernel * 5.3. */ /* * Base wmark_low is 4 * the square root of Kbytes of RAM. */ long wmark = 4 * int_sqrt(allmem/1024) * 1024; /* * Clamp to between 128K and 64MB. */ wmark = MAX(wmark, 128 * 1024); wmark = MIN(wmark, 64 * 1024 * 1024); /* * watermark_boost can increase the wmark by up to 150%. */ wmark += wmark * 150 / 100; /* * arc_sys_free needs to be more than 2x the watermark, because * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up * to 3x to ensure we're above it. */ arc_sys_free = wmark * 3 + allmem / 32; } void arc_lowmem_init(void) { uint64_t allmem = arc_all_memory(); /* * Register a shrinker to support synchronous (direct) memory * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. */ spl_register_shrinker(&arc_shrinker); arc_set_sys_free(allmem); } void arc_lowmem_fini(void) { spl_unregister_shrinker(&arc_shrinker); } int param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp) { int error; error = spl_param_set_u64(buf, kp); if (error < 0) return (SET_ERROR(error)); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_min(const char *buf, zfs_kernel_param_t *kp) { return (param_set_arc_u64(buf, kp)); } int param_set_arc_max(const char *buf, zfs_kernel_param_t *kp) { return (param_set_arc_u64(buf, kp)); } int param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) { int error; error = param_set_int(buf, kp); if (error < 0) return (SET_ERROR(error)); arc_tuning_update(B_TRUE); return (0); } #ifdef CONFIG_MEMORY_HOTPLUG static int arc_hotplug_callback(struct notifier_block *self, unsigned long action, void *arg) { (void) self, (void) arg; uint64_t allmem = arc_all_memory(); if (action != MEM_ONLINE) return (NOTIFY_OK); arc_set_limits(allmem); #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif arc_set_sys_free(allmem); return (NOTIFY_OK); } #endif void arc_register_hotplug(void) { #ifdef CONFIG_MEMORY_HOTPLUG arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; /* There is no significance to the value 100 */ arc_hotplug_callback_mem_nb.priority = 100; register_memory_notifier(&arc_hotplug_callback_mem_nb); #endif } void arc_unregister_hotplug(void) { #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&arc_hotplug_callback_mem_nb); #endif } #else /* _KERNEL */ int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; /* Every 100 calls, free a small amount */ if (random_in_range(100) == 0) lowest = -1024; return (lowest); } int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { (void) spa, (void) reserve, (void) txg; return (0); } uint64_t arc_all_memory(void) { return (ptob(physmem) / 2); } uint64_t arc_free_memory(void) { return (random_in_range(arc_all_memory() * 20 / 100)); } void arc_register_hotplug(void) { } void arc_unregister_hotplug(void) { } #endif /* _KERNEL */ -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the metadata limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(uint64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 02cb379ea840..94e25fa0ae8f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -1,1317 +1,1317 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2018 George Melikov. All Rights Reserved. * Copyright (c) 2019 Datto, Inc. All rights reserved. * Copyright (c) 2020 The MathWorks, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' and 'shares' directory, but this may * expand in the future. The elements are built dynamically, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding inode. * * All mounts are handled automatically by an user mode helper which invokes * the mount procedure. Unmounts are handled by allowing the mount * point to expire so the kernel may automatically unmount it. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same * zfsvfs_t as the head filesystem (what '.zfs' lives under). * * File systems mounted on top of the '.zfs/snapshot/' paths * (ie: snapshots) are complete ZFS filesystems and have their own unique * zfsvfs_t. However, the fsid reported by these mounts will be the same * as that used by the parent zfsvfs_t to make NFS happy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" /* * Two AVL trees are maintained which contain all currently automounted * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t * entry which MUST: * * - be attached to both trees, and * - be unique, no duplicate entries are allowed. * * The zfs_snapshots_by_name tree is indexed by the full dataset name * while the zfs_snapshots_by_objsetid tree is indexed by the unique * objsetid. This allows for fast lookups either by name or objsetid. */ static avl_tree_t zfs_snapshots_by_name; static avl_tree_t zfs_snapshots_by_objsetid; static krwlock_t zfs_snapshot_lock; /* * Control Directory Tunables (.zfs) */ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; static int zfs_admin_snapshot = 0; typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ zfs_refcount_t se_refcount; /* reference count */ } zfs_snapentry_t; static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); /* * Allocate a new zfs_snapentry_t being careful to make a copy of the * the snapshot name and provided mount point. No reference is taken. */ static zfs_snapentry_t * zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa, uint64_t objsetid, struct dentry *root_dentry) { zfs_snapentry_t *se; se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); se->se_name = kmem_strdup(full_name); se->se_path = kmem_strdup(full_path); se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL); zfs_refcount_create(&se->se_refcount); return (se); } /* * Free a zfs_snapentry_t the caller must ensure there are no active * references. */ static void zfsctl_snapshot_free(zfs_snapentry_t *se) { zfs_refcount_destroy(&se->se_refcount); kmem_strfree(se->se_name); kmem_strfree(se->se_path); rw_destroy(&se->se_taskqid_lock); kmem_free(se, sizeof (zfs_snapentry_t)); } /* * Hold a reference on the zfs_snapentry_t. */ static void zfsctl_snapshot_hold(zfs_snapentry_t *se) { zfs_refcount_add(&se->se_refcount, NULL); } /* * Release a reference on the zfs_snapentry_t. When the number of * references drops to zero the structure will be freed. */ static void zfsctl_snapshot_rele(zfs_snapentry_t *se) { if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) zfsctl_snapshot_free(se); } /* * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part * of the trees a reference is held. */ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); zfsctl_snapshot_hold(se); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); } /* * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped, * this can result in the structure being freed if that was the last * remaining reference. */ static void zfsctl_snapshot_remove(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); avl_remove(&zfs_snapshots_by_name, se); avl_remove(&zfs_snapshots_by_objsetid, se); zfsctl_snapshot_rele(se); } /* * Snapshot name comparison function for the zfs_snapshots_by_name. */ static int snapentry_compare_by_name(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; int ret; ret = strcmp(se_a->se_name, se_b->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Snapshot name comparison function for the zfs_snapshots_by_objsetid. */ static int snapentry_compare_by_objsetid(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; if (se_a->se_spa != se_b->se_spa) return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); if (se_a->se_objsetid < se_b->se_objsetid) return (-1); else if (se_a->se_objsetid > se_b->se_objsetid) return (1); else return (0); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname * is found a pointer to the zfs_snapentry_t is returned and a reference * taken on the structure. The caller is responsible for dropping the * reference with zfsctl_snapshot_rele(). If the snapname is not found * NULL will be returned. */ static zfs_snapentry_t * zfsctl_snapshot_find_by_name(const char *snapname) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_name = (char *)snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id * rather than the snapname. In all other respects it behaves the same * as zfsctl_snapshot_find_by_name(). */ static zfs_snapentry_t * zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_spa = spa; search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is * removed, renamed, and added back to the new correct location in the tree. */ static int zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname) { zfs_snapentry_t *se; ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); se = zfsctl_snapshot_find_by_name(old_snapname); if (se == NULL) return (SET_ERROR(ENOENT)); zfsctl_snapshot_remove(se); kmem_strfree(se->se_name); se->se_name = kmem_strdup(new_snapname); zfsctl_snapshot_add(se); zfsctl_snapshot_rele(se); return (0); } /* * Delayed task responsible for unmounting an expired automounted snapshot. */ static void snapentry_expire(void *data) { zfs_snapentry_t *se = (zfs_snapentry_t *)data; spa_t *spa = se->se_spa; uint64_t objsetid = se->se_objsetid; if (zfs_expire_snapshot <= 0) { zfsctl_snapshot_rele(se); return; } rw_enter(&se->se_taskqid_lock, RW_WRITER); se->se_taskqid = TASKQID_INVALID; rw_exit(&se->se_taskqid_lock); (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); /* * Reschedule the unmount if the zfs_snapentry_t wasn't removed. * This can occur when the snapshot is busy. */ rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_rele(se); } rw_exit(&zfs_snapshot_lock); } /* * Cancel an automatic unmount of a snapname. This callback is responsible * for dropping the reference on the zfs_snapentry_t which was taken when * during dispatch. */ static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { int err = 0; rw_enter(&se->se_taskqid_lock, RW_WRITER); err = taskq_cancel_id(system_delay_taskq, se->se_taskqid); /* * if we get ENOENT, the taskq couldn't be found to be * canceled, so we can just mark it as invalid because * it's already gone. If we got EBUSY, then we already * blocked until it was gone _anyway_, so we don't care. */ se->se_taskqid = TASKQID_INVALID; rw_exit(&se->se_taskqid_lock); if (err == 0) { zfsctl_snapshot_rele(se); } } /* * Dispatch the unmount task for delayed handling with a hold protecting it. */ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { if (delay <= 0) return; zfsctl_snapshot_hold(se); rw_enter(&se->se_taskqid_lock, RW_WRITER); /* * If this condition happens, we managed to: * - dispatch once * - want to dispatch _again_ before it returned * * So let's just return - if that task fails at unmounting, * we'll eventually dispatch again, and if it succeeds, * no problem. */ if (se->se_taskqid != TASKQID_INVALID) { rw_exit(&se->se_taskqid_lock); zfsctl_snapshot_rele(se); return; } se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); rw_exit(&se->se_taskqid_lock); } /* * Schedule an automatic unmount of objset id to occur in delay seconds from * now. Any previous delayed unmount will be cancelled in favor of the * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name() * and held until the outstanding task is handled or cancelled. */ int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) { zfs_snapentry_t *se; int error = ENOENT; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, delay); zfsctl_snapshot_rele(se); error = 0; } rw_exit(&zfs_snapshot_lock); return (error); } /* * Check if snapname is currently mounted. Returned non-zero when mounted * and zero when unmounted. */ static boolean_t zfsctl_snapshot_ismounted(const char *snapname) { zfs_snapentry_t *se; boolean_t ismounted = B_FALSE; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { zfsctl_snapshot_rele(se); ismounted = B_TRUE; } rw_exit(&zfs_snapshot_lock); return (ismounted); } /* * Check if the given inode is a part of the virtual .zfs directory. */ boolean_t zfsctl_is_node(struct inode *ip) { return (ITOZ(ip)->z_is_ctldir); } /* * Check if the given inode is a .zfs/snapshots/snapname directory. */ boolean_t zfsctl_is_snapdir(struct inode *ip) { return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); } /* * Allocate a new inode with the passed id and ops. */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops, uint64_t creation) { struct inode *ip; znode_t *zp; inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); if (!creation) now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_id = id; zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_is_sa = B_FALSE; #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; #endif zp->z_is_ctldir = B_TRUE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; zp->z_mapcnt = 0; zp->z_size = 0; zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; - ip->i_ctime = now; + zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) ip->i_opflags &= ~IOP_XATTR; #endif if (insert_inode_locked(ip)) { unlock_new_inode(ip); iput(ip); return (NULL); } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); unlock_new_inode(ip); return (ip); } /* * Lookup the inode with given id, it will be allocated if needed. */ static struct inode * zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; uint64_t creation = 0; dsl_dataset_t *snap_ds; dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { pool = dmu_objset_pool(zfsvfs->z_os); dsl_pool_config_enter(pool, FTAG); if (!dsl_dataset_hold_obj(pool, ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { creation = dsl_get_creation(snap_ds); dsl_dataset_rele(snap_ds, FTAG); } dsl_pool_config_exit(pool, FTAG); } /* May fail due to concurrent zfsctl_inode_alloc() */ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. All other entities * under the '.zfs' directory are created dynamically as needed. * * Because the dynamically created '.zfs' directory entries assume the use * of 64-bit inode numbers this support must be disabled on 32-bit systems. */ int zfsctl_create(zfsvfs_t *zfsvfs) { ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); return (0); } /* * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. * Only called when the filesystem is unmounted. */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { if (zfsvfs->z_issnap) { zfs_snapentry_t *se; spa_t *spa = zfsvfs->z_os->os_spa; uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); if (se != NULL) zfsctl_snapshot_remove(se); rw_exit(&zfs_snapshot_lock); if (se != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_rele(se); } } else if (zfsvfs->z_ctldir) { iput(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL); return (ZTOZSB(zp)->z_ctldir); } /* * Generate a long fid to indicate a snapdir. We encode whether snapdir is * already mounted in gen field. We do this because nfsd lookup will not * trigger automount. Next time the nfsd does fh_to_dentry, we will notice * this and do automount and return ESTALE to force nfsd revalidate and follow * mount. */ static int zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) { zfid_short_t *zfid = (zfid_short_t *)fidp; zfid_long_t *zlfid = (zfid_long_t *)fidp; uint32_t gen = 0; uint64_t object; uint64_t objsetid; int i; struct dentry *dentry; if (fidp->fid_len < LONG_FID_LEN) { fidp->fid_len = LONG_FID_LEN; return (SET_ERROR(ENOSPC)); } object = ip->i_ino; objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; zfid->zf_len = LONG_FID_LEN; dentry = d_obtain_alias(igrab(ip)); if (!IS_ERR(dentry)) { gen = !!d_mountpoint(dentry); dput(dentry); } for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; return (0); } /* * Generate an appropriate fid for an entry in the .zfs directory. */ int zfsctl_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t object = zp->z_id; zfid_short_t *zfid; int i; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsctl_is_snapdir(ip)) { zfs_exit(zfsvfs, FTAG); return (zfsctl_snapdir_fid(ip, fidp)); } if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; zfs_exit(zfsvfs, FTAG); return (0); } /* * Construct a full dataset name in full_name: "pool/dataset@snap_name" */ static int zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, char *full_name) { objset_t *os = zfsvfs->z_os; if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, full_name); if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(full_name, "@"); (void) strcat(full_name, snap_name); return (0); } /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ static int zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, int path_len, char *full_path) { objset_t *os = zfsvfs->z_os; fstrans_cookie_t cookie; char *snapname; boolean_t case_conflict; uint64_t id, pos = 0; int error = 0; if (zfsvfs->z_vfs->vfs_mntpoint == NULL) return (SET_ERROR(ENOENT)); cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto out; if (id == objsetid) break; } snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint, snapname); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); return (error); } /* * Special case the handling of "..". */ int zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, &zpl_fops_snapdir, &zpl_ops_snapdir); } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, &zpl_fops_shares, &zpl_ops_shares); } else { *ipp = NULL; } if (*ipp == NULL) error = SET_ERROR(ENOENT); zfs_exit(zfsvfs, FTAG); return (error); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem inode as necessary. */ int zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); uint64_t id; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, &simple_dir_operations, &simple_dir_inode_operations); if (*ipp == NULL) error = SET_ERROR(ENOENT); zfs_exit(zfsvfs, FTAG); return (error); } /* * Renaming a directory under '.zfs/snapshot' will automatically trigger * a rename of the snapshot to the new given name. The rename is confined * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. */ int zfsctl_snapdir_rename(struct inode *sdip, const char *snm, struct inode *tdip, const char *tnm, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(sdip); char *to, *from, *real, *fsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { snm = real; } else if (error != ENOTSUP) { goto out; } } dmu_objset_name(zfsvfs->z_os, fsname); error = zfsctl_snapshot_name(ITOZSB(sdip), snm, ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) goto out; /* * Cannot move snapshots out of the snapdir. */ if (sdip != tdip) { error = SET_ERROR(EINVAL); goto out; } /* * No-op when names are identical. */ if (strcmp(snm, tnm) == 0) { error = 0; goto out; } rw_enter(&zfs_snapshot_lock, RW_WRITER); error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (error == 0) (void) zfsctl_snapshot_rename(snm, tnm); rw_exit(&zfs_snapshot_lock); out: kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Removing a directory under '.zfs/snapshot' will automatically trigger * the removal of the snapshot with the given name. */ int zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *snapname, *real; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, name, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { name = real; } else if (error != ENOTSUP) { goto out; } } error = zfsctl_snapshot_name(ITOZSB(dip), name, ZFS_MAX_DATASET_NAME_LEN, snapname); if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); if (error != 0) goto out; error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); if ((error == 0) || (error == ENOENT)) error = dsl_destroy_snapshot(snapname, B_FALSE); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Creating a directory under '.zfs/snapshot' will automatically trigger * the creation of a new snapshot with the given name. */ int zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *dsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { error = SET_ERROR(EILSEQ); goto out; } dmu_objset_name(zfsvfs->z_os, dsname); error = zfs_secpolicy_snapshot_perms(dsname, cr); if (error != 0) goto out; if (error == 0) { error = dmu_objset_snapshot_one(dsname, dirname); if (error != 0) goto out; error = zfsctl_snapdir_lookup(dip, dirname, ipp, 0, cr, NULL, NULL); } out: kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); return (error); } /* * Flush everything out of the kernel's export table and such. * This is needed as once the snapshot is used over NFS, its * entries in svc_export and svc_expkey caches hold reference * to the snapshot mount point. There is no known way of flushing * only the entries related to the snapshot. */ static void exportfs_flush(void) { char *argv[] = { "/usr/sbin/exportfs", "-f", NULL }; char *envp[] = { NULL }; (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); } /* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed, is just a * best effort. In the case where it does fail, perhaps because * it's in use, the unmount will fail harmlessly. */ int zfsctl_snapshot_unmount(const char *snapname, int flags) { char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, NULL }; char *envp[] = { NULL }; zfs_snapentry_t *se; int error; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { rw_exit(&zfs_snapshot_lock); return (SET_ERROR(ENOENT)); } rw_exit(&zfs_snapshot_lock); exportfs_flush(); if (flags & MNT_FORCE) argv[4] = "-fn"; argv[5] = se->se_path; dprintf("unmount; path=%s\n", se->se_path); error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); zfsctl_snapshot_rele(se); /* * The umount system utility will return 256 on error. We must * assume this error is because the file system is busy so it is * converted to the more sensible EBUSY. */ if (error) error = SET_ERROR(EBUSY); return (error); } int zfsctl_snapshot_mount(struct path *path, int flags) { struct dentry *dentry = path->dentry; struct inode *ip = dentry->d_inode; zfsvfs_t *zfsvfs; zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; if (ip == NULL) return (SET_ERROR(EISDIR)); zfsvfs = ITOZSB(ip); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_name(zfsvfs, dname(dentry), ZFS_MAX_DATASET_NAME_LEN, full_name); if (error) goto error; /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); /* * Multiple concurrent automounts of a snapshot are never allowed. * The snapshot may be manually mounted as many times as desired. */ if (zfsctl_snapshot_ismounted(full_name)) { error = 0; goto error; } /* * Attempt to mount the snapshot from user space. Normally this * would be done using the vfs_kern_mount() function, however that * function is marked GPL-only and cannot be used. On error we * careful to log the real error to the console and return EISDIR * to safely abort the automount. This should be very rare. * * If the user mode helper happens to return EBUSY, a concurrent * mount is already in progress in which case the error is ignored. * Take note that if the program was executed successfully the return * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); argv[5] = full_name; argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { zfs_dbgmsg("Unable to automount %s error=%d", full_path, error); error = SET_ERROR(EISDIR); } else { /* * EBUSY, this could mean a concurrent mount, or the * snapshot has already been mounted at completely * different place. We return 0 so VFS will retry. For * the latter case the VFS will retry several times * and return ELOOP, which is probably not a very good * behavior. */ error = 0; } goto error; } /* * Follow down in to the mounted snapshot and set MNT_SHRINKABLE * to identify this as an automounted filesystem. */ spath = *path; path_get(&spath); if (follow_down_one(&spath)) { snap_zfsvfs = ITOZSB(spath.dentry->d_inode); snap_zfsvfs->z_parent = zfsvfs; dentry = spath.dentry; spath.mnt->mnt_flags |= MNT_SHRINKABLE; rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_alloc(full_name, full_path, snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), dentry); zfsctl_snapshot_add(se); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); rw_exit(&zfs_snapshot_lock); } path_put(&spath); error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the snapdir inode from fid */ int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, struct inode **ipp) { int error; struct path path; char *mnt; struct dentry *dentry; mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, MAXPATHLEN, mnt); if (error) goto out; /* Trigger automount */ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); if (error) goto out; path_put(&path); /* * Get the snapdir inode. Note, we don't want to use the above * path because it contains the root of the snapshot rather * than the snapdir. */ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); if (*ipp == NULL) { error = SET_ERROR(ENOENT); goto out; } /* check gen, see zfsctl_snapdir_fid */ dentry = d_obtain_alias(igrab(*ipp)); if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { iput(*ipp); *ipp = NULL; error = SET_ERROR(ENOENT); } if (!IS_ERR(dentry)) dput(dentry); out: kmem_free(mnt, MAXPATHLEN); return (error); } int zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); znode_t *zp; znode_t *dzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsvfs->z_shares_dir == 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); zrele(dzp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Initialize the various pieces we'll need to create and manipulate .zfs * directories. Currently this is unused but available. */ void zfsctl_init(void) { avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_name)); avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); } /* * Cleanup the various pieces we needed for .zfs directories. In particular * ensure the expiry timer is canceled safely. */ void zfsctl_fini(void) { avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); } module_param(zfs_admin_snapshot, int, 0644); MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); module_param(zfs_expire_snapshot, int, 0644); MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index 3efd4ab159c6..c2ed67c438c6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -1,474 +1,445 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ /* * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef _KERNEL #include #include #include #include #include #include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction * of the move, and the I/O parameters are provided in "uio", which is * update to reflect the data which was moved. Returns 0 on success or * a non-zero errno on failure. */ static int zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct iovec *iov = uio->uio_iov; size_t skip = uio->uio_skip; ulong_t cnt; while (n && uio->uio_resid) { cnt = MIN(iov->iov_len - skip, n); switch (uio->uio_segflg) { case UIO_USERSPACE: /* * p = kernel data pointer * iov->iov_base = user data pointer */ if (rw == UIO_READ) { if (copy_to_user(iov->iov_base+skip, p, cnt)) return (EFAULT); } else { unsigned long b_left = 0; if (uio->uio_fault_disable) { if (!zfs_access_ok(VERIFY_READ, (iov->iov_base + skip), cnt)) { return (EFAULT); } pagefault_disable(); b_left = __copy_from_user_inatomic(p, (iov->iov_base + skip), cnt); pagefault_enable(); } else { b_left = copy_from_user(p, (iov->iov_base + skip), cnt); } if (b_left > 0) { unsigned long c_bytes = cnt - b_left; uio->uio_skip += c_bytes; ASSERT3U(uio->uio_skip, <, iov->iov_len); uio->uio_resid -= c_bytes; uio->uio_loffset += c_bytes; return (EFAULT); } } break; case UIO_SYSSPACE: if (rw == UIO_READ) memcpy(iov->iov_base + skip, p, cnt); else memcpy(p, iov->iov_base + skip, cnt); break; default: ASSERT(0); } skip += cnt; if (skip == iov->iov_len) { skip = 0; uio->uio_iov = (++iov); uio->uio_iovcnt--; } uio->uio_skip = skip; uio->uio_resid -= cnt; uio->uio_loffset += cnt; p = (caddr_t)p + cnt; n -= cnt; } return (0); } static int zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct bio_vec *bv = uio->uio_bvec; size_t skip = uio->uio_skip; ulong_t cnt; while (n && uio->uio_resid) { void *paddr; cnt = MIN(bv->bv_len - skip, n); paddr = zfs_kmap_atomic(bv->bv_page); if (rw == UIO_READ) { /* Copy from buffer 'p' to the bvec data */ memcpy(paddr + bv->bv_offset + skip, p, cnt); } else { /* Copy from bvec data to buffer 'p' */ memcpy(p, paddr + bv->bv_offset + skip, cnt); } zfs_kunmap_atomic(paddr); skip += cnt; if (skip == bv->bv_len) { skip = 0; uio->uio_bvec = (++bv); uio->uio_iovcnt--; } uio->uio_skip = skip; uio->uio_resid -= cnt; uio->uio_loffset += cnt; p = (caddr_t)p + cnt; n -= cnt; } return (0); } #ifdef HAVE_BLK_MQ static void zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw, struct bio_vec *bv) { void *paddr; paddr = zfs_kmap_atomic(bv->bv_page); if (rw == UIO_READ) { /* Copy from buffer 'p' to the bvec data */ memcpy(paddr + bv->bv_offset + skip, p, cnt); } else { /* Copy from bvec data to buffer 'p' */ memcpy(p, paddr + bv->bv_offset + skip, cnt); } zfs_kunmap_atomic(paddr); } /* * Copy 'n' bytes of data between the buffer p[] and the data represented * by the request in the uio. */ static int zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { struct request *rq = uio->rq; struct bio_vec bv; struct req_iterator iter; size_t this_seg_start; /* logical offset */ size_t this_seg_end; /* logical offset */ size_t skip_in_seg; size_t copy_from_seg; size_t orig_loffset; int copied = 0; /* * Get the original logical offset of this entire request (because * uio->uio_loffset will be modified over time). */ orig_loffset = io_offset(NULL, rq); this_seg_start = orig_loffset; rq_for_each_segment(bv, rq, iter) { - if (uio->iter.bio) { - /* - * If uio->iter.bio is present, then we know we've saved - * uio->iter from a previous call to this function, and - * we can skip ahead in this rq_for_each_segment() loop - * to where we last left off. That way, we don't need - * to iterate over tons of segments we've already - * processed - we can just restore the "saved state". - */ - iter = uio->iter; - bv = uio->bv; - this_seg_start = uio->uio_loffset; - memset(&uio->iter, 0, sizeof (uio->iter)); - continue; - } - /* * Lookup what the logical offset of the last byte of this * segment is. */ this_seg_end = this_seg_start + bv.bv_len - 1; /* * We only need to operate on segments that have data we're * copying. */ if (uio->uio_loffset >= this_seg_start && uio->uio_loffset <= this_seg_end) { /* * Some, or all, of the data in this segment needs to be * copied. */ /* * We may be not be copying from the first byte in the * segment. Figure out how many bytes to skip copying * from the beginning of this segment. */ skip_in_seg = uio->uio_loffset - this_seg_start; /* * Calculate the total number of bytes from this * segment that we will be copying. */ copy_from_seg = MIN(bv.bv_len - skip_in_seg, n); /* Copy the bytes */ zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv); p = ((char *)p) + copy_from_seg; n -= copy_from_seg; uio->uio_resid -= copy_from_seg; uio->uio_loffset += copy_from_seg; copied = 1; /* We copied some data */ } - if (n == 0) { - /* - * All done copying. Save our 'iter' value to the uio. - * This allows us to "save our state" and skip ahead in - * the rq_for_each_segment() loop the next time we call - * call zfs_uiomove_bvec_rq() on this uio (which we - * will be doing for any remaining data in the uio). - */ - uio->iter = iter; /* make a copy of the struct data */ - uio->bv = bv; - return (0); - } - this_seg_start = this_seg_end + 1; } if (!copied) { /* Didn't copy anything */ uio->uio_resid = 0; } return (0); } #endif static int zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { #ifdef HAVE_BLK_MQ if (uio->rq != NULL) return (zfs_uiomove_bvec_rq(p, n, rw, uio)); #else ASSERT3P(uio->rq, ==, NULL); #endif return (zfs_uiomove_bvec_impl(p, n, rw, uio)); } #if defined(HAVE_VFS_IOV_ITER) static int zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, boolean_t revert) { size_t cnt = MIN(n, uio->uio_resid); if (uio->uio_skip) iov_iter_advance(uio->uio_iter, uio->uio_skip); if (rw == UIO_READ) cnt = copy_to_iter(p, cnt, uio->uio_iter); else cnt = copy_from_iter(p, cnt, uio->uio_iter); /* * When operating on a full pipe no bytes are processed. * In which case return EFAULT which is converted to EAGAIN * by the kernel's generic_file_splice_read() function. */ if (cnt == 0) return (EFAULT); /* * Revert advancing the uio_iter. This is set by zfs_uiocopy() * to avoid consuming the uio and its iov_iter structure. */ if (revert) iov_iter_revert(uio->uio_iter, cnt); uio->uio_resid -= cnt; uio->uio_loffset += cnt; return (0); } #endif int zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { if (uio->uio_segflg == UIO_BVEC) return (zfs_uiomove_bvec(p, n, rw, uio)); #if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE)); #endif else return (zfs_uiomove_iov(p, n, rw, uio)); } EXPORT_SYMBOL(zfs_uiomove); /* * Fault in the pages of the first n bytes specified by the uio structure. * 1 byte in each page is touched and the uio struct is unmodified. Any * error will terminate the process as this is only a best attempt to get * the pages resident. */ int zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) { if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { /* There's never a need to fault in kernel pages */ return (0); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { /* * At least a Linux 4.9 kernel, iov_iter_fault_in_readable() * can be relied on to fault in user pages when referenced. */ if (iov_iter_fault_in_readable(uio->uio_iter, n)) return (EFAULT); #endif } else { /* Fault in all user pages */ ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE); const struct iovec *iov = uio->uio_iov; int iovcnt = uio->uio_iovcnt; size_t skip = uio->uio_skip; uint8_t tmp; caddr_t p; for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { ulong_t cnt = MIN(iov->iov_len - skip, n); /* empty iov */ if (cnt == 0) continue; n -= cnt; /* touch each page in this segment. */ p = iov->iov_base + skip; while (cnt) { if (copy_from_user(&tmp, p, 1)) return (EFAULT); ulong_t incr = MIN(cnt, PAGESIZE); p += incr; cnt -= incr; } /* touch the last byte in case it straddles a page. */ p--; if (copy_from_user(&tmp, p, 1)) return (EFAULT); } } return (0); } EXPORT_SYMBOL(zfs_uio_prefaultpages); /* * The same as zfs_uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. */ int zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) { zfs_uio_t uio_copy; int ret; memcpy(&uio_copy, uio, sizeof (zfs_uio_t)); if (uio->uio_segflg == UIO_BVEC) ret = zfs_uiomove_bvec(p, n, rw, &uio_copy); #if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE); #endif else ret = zfs_uiomove_iov(p, n, rw, &uio_copy); *cbytes = uio->uio_resid - uio_copy.uio_resid; return (ret); } EXPORT_SYMBOL(zfs_uiocopy); /* * Drop the next n chars out of *uio. */ void zfs_uioskip(zfs_uio_t *uio, size_t n) { if (n > uio->uio_resid) return; /* * When using a uio with a struct request, we simply * use uio_loffset as a pointer to the next logical byte to * copy in the request. We don't have to do any fancy * accounting with uio_bvec/uio_iovcnt since we don't use * them. */ if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) { uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_bvec->bv_len) { uio->uio_skip -= uio->uio_bvec->bv_len; uio->uio_bvec++; uio->uio_iovcnt--; } #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { iov_iter_advance(uio->uio_iter, n); #endif } else { uio->uio_skip += n; while (uio->uio_iovcnt && uio->uio_skip >= uio->uio_iov->iov_len) { uio->uio_skip -= uio->uio_iov->iov_len; uio->uio_iov++; uio->uio_iovcnt--; } } uio->uio_loffset += n; uio->uio_resid -= n; } EXPORT_SYMBOL(zfs_uioskip); #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index a1db5c57c18b..2792bc027213 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1,2129 +1,2129 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" enum { TOKEN_RO, TOKEN_RW, TOKEN_SETUID, TOKEN_NOSETUID, TOKEN_EXEC, TOKEN_NOEXEC, TOKEN_DEVICES, TOKEN_NODEVICES, TOKEN_DIRXATTR, TOKEN_SAXATTR, TOKEN_XATTR, TOKEN_NOXATTR, TOKEN_ATIME, TOKEN_NOATIME, TOKEN_RELATIME, TOKEN_NORELATIME, TOKEN_NBMAND, TOKEN_NONBMAND, TOKEN_MNTPOINT, TOKEN_LAST, }; static const match_table_t zpl_tokens = { { TOKEN_RO, MNTOPT_RO }, { TOKEN_RW, MNTOPT_RW }, { TOKEN_SETUID, MNTOPT_SETUID }, { TOKEN_NOSETUID, MNTOPT_NOSETUID }, { TOKEN_EXEC, MNTOPT_EXEC }, { TOKEN_NOEXEC, MNTOPT_NOEXEC }, { TOKEN_DEVICES, MNTOPT_DEVICES }, { TOKEN_NODEVICES, MNTOPT_NODEVICES }, { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, { TOKEN_SAXATTR, MNTOPT_SAXATTR }, { TOKEN_XATTR, MNTOPT_XATTR }, { TOKEN_NOXATTR, MNTOPT_NOXATTR }, { TOKEN_ATIME, MNTOPT_ATIME }, { TOKEN_NOATIME, MNTOPT_NOATIME }, { TOKEN_RELATIME, MNTOPT_RELATIME }, { TOKEN_NORELATIME, MNTOPT_NORELATIME }, { TOKEN_NBMAND, MNTOPT_NBMAND }, { TOKEN_NONBMAND, MNTOPT_NONBMAND }, { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, { TOKEN_LAST, NULL }, }; static void zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); kmem_free(vfsp, sizeof (vfs_t)); } } static int zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) { switch (token) { case TOKEN_RO: vfsp->vfs_readonly = B_TRUE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_RW: vfsp->vfs_readonly = B_FALSE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_SETUID: vfsp->vfs_setuid = B_TRUE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_NOSETUID: vfsp->vfs_setuid = B_FALSE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_EXEC: vfsp->vfs_exec = B_TRUE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_NOEXEC: vfsp->vfs_exec = B_FALSE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_DEVICES: vfsp->vfs_devices = B_TRUE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_NODEVICES: vfsp->vfs_devices = B_FALSE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_DIRXATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_SAXATTR: vfsp->vfs_xattr = ZFS_XATTR_SA; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_XATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_NOXATTR: vfsp->vfs_xattr = ZFS_XATTR_OFF; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_ATIME: vfsp->vfs_atime = B_TRUE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_NOATIME: vfsp->vfs_atime = B_FALSE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_RELATIME: vfsp->vfs_relatime = B_TRUE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NORELATIME: vfsp->vfs_relatime = B_FALSE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NBMAND: vfsp->vfs_nbmand = B_TRUE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_NONBMAND: vfsp->vfs_nbmand = B_FALSE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); break; default: break; } return (0); } /* * Parse the raw mntopts and return a vfs_t describing the options. */ static int zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) { vfs_t *tmp_vfsp; int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; char *tmp_mntopts, *p, *t; int token; tmp_mntopts = t = kmem_strdup(mntopts); if (tmp_mntopts == NULL) return (SET_ERROR(ENOMEM)); while ((p = strsep(&t, ",")) != NULL) { if (!*p) continue; args[0].to = args[0].from = NULL; token = match_token(p, zpl_tokens, args); error = zfsvfs_parse_option(p, token, args, tmp_vfsp); if (error) { kmem_strfree(tmp_mntopts); zfsvfs_vfs_free(tmp_vfsp); return (error); } } kmem_strfree(tmp_mntopts); } *vfsp = tmp_vfsp; return (0); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); } int zfs_sync(struct super_block *sb, int wait, cred_t *cr) { (void) cr; zfsvfs_t *zfsvfs = sb->s_fs_info; /* * Semantically, the only requirement is that the sync be initiated. * The DMU syncs out txgs frequently, so there's nothing to do. */ if (!wait) return (0); if (zfsvfs != NULL) { /* * Sync a specific filesystem. */ dsl_pool_t *dp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; /* * Update SB_NOATIME bit in VFS super block. Since atime update is * determined by atime_needs_update(), atime_needs_update() needs to * return false if atime is turned off, and not unconditionally return * false if atime is turned on. */ if (newval) sb->s_flags &= ~SB_NOATIME; else sb->s_flags |= SB_NOATIME; } static void relatime_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_relatime = newval; } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void acltype_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; switch (newval) { case ZFS_ACLTYPE_NFSV4: case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; case ZFS_ACLTYPE_POSIX: #ifdef CONFIG_FS_POSIX_ACL zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; #endif /* CONFIG_FS_POSIX_ACL */ break; default: break; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval) sb->s_flags |= SB_RDONLY; else sb->s_flags &= ~SB_RDONLY; } static void devices_changed_cb(void *arg, uint64_t newval) { } static void setuid_changed_cb(void *arg, uint64_t newval) { } static void exec_changed_cb(void *arg, uint64_t newval) { } static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval == TRUE) sb->s_flags |= SB_MANDLOCK; else sb->s_flags &= ~SB_MANDLOCK; } static void snapdir_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { vfsp->vfs_do_readonly = B_TRUE; vfsp->vfs_readonly = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (vfsp->vfs_do_readonly) readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); if (vfsp->vfs_do_setuid) setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); if (vfsp->vfs_do_exec) exec_changed_cb(zfsvfs, vfsp->vfs_exec); if (vfsp->vfs_do_devices) devices_changed_cb(zfsvfs, vfsp->vfs_devices); if (vfsp->vfs_do_xattr) xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); if (vfsp->vfs_do_atime) atime_changed_cb(zfsvfs, vfsp->vfs_atime); if (vfsp->vfs_do_relatime) relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); if (vfsp->vfs_do_nbmand) nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Takes a dataset, a property, a value and that value's setpoint as * found in the ZAP. Checks if the property has been changed in the vfs. * If so, val and setpoint will be overwritten with updated content. * Otherwise, they are left unchanged. */ int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) return (EINVAL); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (zfvp == NULL) return (ESRCH); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfsp->vfs_do_atime) tmp = vfsp->vfs_atime; break; case ZFS_PROP_RELATIME: if (vfsp->vfs_do_relatime) tmp = vfsp->vfs_relatime; break; case ZFS_PROP_DEVICES: if (vfsp->vfs_do_devices) tmp = vfsp->vfs_devices; break; case ZFS_PROP_EXEC: if (vfsp->vfs_do_exec) tmp = vfsp->vfs_exec; break; case ZFS_PROP_SETUID: if (vfsp->vfs_do_setuid) tmp = vfsp->vfs_setuid; break; case ZFS_PROP_READONLY: if (vfsp->vfs_do_readonly) tmp = vfsp->vfs_readonly; break; case ZFS_PROP_XATTR: if (vfsp->vfs_do_xattr) tmp = vfsp->vfs_xattr; break; case ZFS_PROP_NBMAND: if (vfsp->vfs_do_nbmand) tmp = vfsp->vfs_nbmand; break; default: return (ENOENT); } if (tmp != *val) { if (setpoint) (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printk("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.\n", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if ((error == 0) && (val == ZFS_XATTR_SA)) zfsvfs->z_xattr_sa = B_TRUE; } error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); return (0); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } /* * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function * on a failure. Do not pass in a statically allocated zfsvfs. */ int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_sb = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); ZFS_TEARDOWN_INIT(zfsvfs); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (int i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); } zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; zfsvfs->z_drain_cancel = B_TRUE; *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; boolean_t readonly = zfs_is_readonly(zfsvfs); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to * allow replays to succeed. */ if (readonly != 0) { readonly_changed_cb(zfsvfs, B_FALSE); } else { zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zfsvfs, B_TRUE); } else { ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i, size = zfsvfs->z_hold_size; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); zfsvfs_vfs_free(zfsvfs->z_vfs); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef HAVE_MLSLABEL /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : SET_ERROR(EACCES)); } return (SET_ERROR(EACCES)); } #endif /* HAVE_MLSLABEL */ static int zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, uint32_t bshift) { char buf[20 + DMU_OBJACCT_PREFIX_LEN]; uint64_t offset = DMU_OBJACCT_PREFIX_LEN; uint64_t quota; uint64_t used; int err; strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, sizeof (buf) - offset, B_FALSE); if (err) return (err); if (zfsvfs->z_projectquota_obj == 0) goto objs; err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) goto objs; else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf + offset, 8, 1, &used); if (unlikely(err == ENOENT)) { uint32_t blksize; u_longlong_t nblocks; /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); if (unlikely(zp->z_blksz == 0)) blksize = zfsvfs->z_max_blksz; used = blksize * nblocks; } else if (err) { return (err); } statp->f_blocks = quota >> bshift; statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; statp->f_bavail = statp->f_bfree; objs: if (zfsvfs->z_projectobjquota_obj == 0) return (0); err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) return (0); else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf, 8, 1, &used); if (unlikely(err == ENOENT)) { /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ used = 1; } else if (err) { return (err); } statp->f_files = quota; statp->f_ffree = (quota > used) ? (quota - used) : 0; return (0); } int zfs_statvfs(struct inode *ip, struct kstatfs *statp) { zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os); /* * The underlying storage pool actually uses multiple block * size. Under Solaris frsize (fragment size) is reported as * the smallest block size we support, and bsize (block size) * as the filesystem's maximum block size. Unfortunately, * under Linux the fragment size and block size are often used * interchangeably. Thus we are forced to report both of them * as the filesystem's maximum block size. */ statp->f_frsize = zfsvfs->z_max_blksz; statp->f_bsize = zfsvfs->z_max_blksz; uint32_t bshift = fls(statp->f_bsize) - 1; /* * The following report "total" blocks of various kinds in * the file system, but reported in terms of f_bsize - the * "preferred" size. */ /* Round up so we never have a filesystem using 0 blocks. */ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); statp->f_blocks = (refdbytes + availbytes) >> bshift; statp->f_bfree = availbytes >> bshift; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of objects available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); statp->f_files = statp->f_ffree + usedobjs; statp->f_fsid.val[0] = (uint32_t)fsid; statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); statp->f_type = ZFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; /* * We have all of 40 characters to stuff a string here. * Is there anything useful we could/should provide? */ memset(statp->f_spare, 0, sizeof (statp->f_spare)); if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && dmu_objset_projectquota_present(zfsvfs->z_os)) { znode_t *zp = ITOZ(ip); if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && zpl_is_valid_projid(zp->z_projid)) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } zfs_exit(zfsvfs, FTAG); return (err); } static int zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) { znode_t *rootzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Linux kernels older than 3.1 do not support a per-filesystem shrinker. * To accommodate this we must improvise and manually walk the list of znodes * attempting to prune dentries in order to be able to drop the inodes. * * To avoid scanning the same znodes multiple times they are always rotated * to the end of the z_all_znodes list. New znodes are inserted at the * end of the list so we're always scanning the oldest znodes first. */ static int zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) { znode_t **zp_array, *zp; int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); int objects = 0; int i = 0, j = 0; zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { if ((i++ > nr_to_scan) || (j >= max_array)) break; ASSERT(list_link_active(&zp->z_link_node)); list_remove(&zfsvfs->z_all_znodes, zp); list_insert_tail(&zfsvfs->z_all_znodes, zp); /* Skip active znodes and .zfs entries */ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) continue; if (igrab(ZTOI(zp)) == NULL) continue; zp_array[j] = zp; j++; } mutex_exit(&zfsvfs->z_znodes_lock); for (i = 0; i < j; i++) { zp = zp_array[i]; ASSERT3P(zp, !=, NULL); d_prune_aliases(ZTOI(zp)); if (atomic_read(&ZTOI(zp)->i_count) == 1) objects++; zrele(zp); } vmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. */ int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); /* * reset sc.nr_to_scan, modified by * scan_objects == super_cache_scan */ sc.nr_to_scan = nr_to_scan; } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); } #elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); #elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) *objects = (*shrinker->shrink)(shrinker, &sc); #elif defined(HAVE_D_PRUNE_ALIASES) #define D_PRUNE_ALIASES_IS_DEFAULT *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #else #error "No available dentry and inode cache pruning mechanism." #endif #if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) #undef D_PRUNE_ALIASES_IS_DEFAULT /* * Fall back to zfs_prune_aliases if the kernel's per-superblock * shrinker couldn't free anything, possibly due to the inodes being * allocated in a different memcg. */ if (*objects == 0) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", nr_to_scan, *objects, error); return (error); } /* * Teardown the zfsvfs_t. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; zfs_unlinked_drain_stop_wait(zfsvfs); /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but iputs run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely check z_all_znodes for being empty because the * VFS has already blocked operations which add to it. */ int round = 0; while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's super block as the * parent filesystem and all of its snapshots have their * inode's super block set to the parent's filesystem's * super block. Note, 'z_parent' is self referential * for non-snapshots. */ shrink_dcache_sb(zfsvfs->z_parent->z_sb); } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no VFS ops active, and any new VFS ops * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. We also grab an extra reference to all * the remaining inodes so that the kernel does not attempt to free * any inodes of a suspended fs. This can cause deadlocks since the * zfs_resume_fs() process may involve starting threads, which might * attempt to free unreferenced inodes to free up memory for the new * thread. */ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); if (igrab(ZTOI(zp)) != NULL) zp->z_suspended = B_TRUE; } mutex_exit(&zfsvfs->z_znodes_lock); } /* * If we are unmounting, set the unmounted flag and let new VFS ops * unblock. zfs_inactive will have the unmounted behavior, and all * other VFS ops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ objset_t *os = zfsvfs->z_os; boolean_t os_dirty = B_FALSE; for (int t = 0; t < TXG_SIZE; t++) { if (dmu_objset_is_dirty(os, t)) { os_dirty = B_TRUE; break; } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } #if defined(HAVE_SUPER_SETUP_BDI_NAME) atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); #endif int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; vfs_t *vfs = NULL; int canwrite; int dataset_visible_zone; ASSERT(zm); ASSERT(osname); dataset_visible_zone = zone_dataset_visible(osname, &canwrite); /* * Refuse to mount a filesystem if we are in a namespace and the * dataset is not visible or writable in that namespace. */ if (!INGLOBALZONE(curproc) && (!dataset_visible_zone || !canwrite)) { return (SET_ERROR(EPERM)); } error = zfsvfs_parse_options(zm->mnt_data, &vfs); if (error) return (error); /* * If a non-writable filesystem is being mounted without the * read-only flag, pretend it was set, as done for snapshots. */ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { zfsvfs_vfs_free(vfs); goto out; } if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) { zfsvfs_vfs_free(vfs); goto out; } vfs->vfs_data = zfsvfs; zfsvfs->z_vfs = vfs; zfsvfs->z_sb = sb; sb->s_fs_info = zfsvfs; sb->s_magic = ZFS_SUPER_MAGIC; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_blocksize = recordsize; sb->s_blocksize_bits = ilog2(recordsize); error = -zpl_bdi_setup(sb, "zfs"); if (error) goto out; sb->s_bdi->ra_pages = 0; /* Set callback operations for the file system. */ sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acltype_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; zfsvfs->z_snap_defer_time = jiffies; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } /* Allocate a root inode for the filesystem. */ error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ goto out; } /* Allocate a root dentry for the filesystem */ sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ error = SET_ERROR(ENOMEM); goto out; } if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); out: if (error) { if (zfsvfs != NULL) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } /* * make sure we don't have dangling sb->s_fs_info which * zfs_preumount will use. */ sb->s_fs_info = NULL; } return (error); } /* * Called when an unmount is requested and certain sanity checks have * already passed. At this point no dentries or inodes have been reclaimed * from their respective caches. We drop the extra reference on the .zfs * control directory to allow everything to be reclaimed. All snapshots * must already have been unmounted to reach this point. */ void zfs_preumount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* zfsvfs is NULL when zfs_domount fails during mount */ if (zfsvfs) { zfs_unlinked_drain_stop_wait(zfsvfs); zfsctl_destroy(sb->s_fs_info); /* * Wait for zrele_async before entering evict_inodes in * generic_shutdown_super. The reason we must finish before * evict_inodes is when lazytime is on, or when zfs_purgedir * calls zfs_zget, zrele would bump i_count from 0 to 1. This * would race with the i_count check in evict_inodes. This means * it could destroy the inode while we are still using it. * * We wait for two passes. xattr directories in the first pass * may add xattr entries in zfs_purgedir, so in the second pass * we wait for them. We don't use taskq_wait here because it is * a pool wide taskq. Other mounted filesystems can constantly * do zrele_async and there's no guarantee when taskq will be * empty. */ taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); } } /* * Called once all other unmount released tear down has occurred. * It is our responsibility to release any remaining infrastructure. */ int zfs_umount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; objset_t *os; if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; zpl_bdi_destroy(sb); /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } zfsvfs_free(zfsvfs); sb->s_fs_info = NULL; return (0); } int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) { zfsvfs_t *zfsvfs = sb->s_fs_info; vfs_t *vfsp; boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); int error; if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && !(*flags & SB_RDONLY)) { *flags |= SB_RDONLY; return (EROFS); } error = zfsvfs_parse_options(zm->mnt_data, &vfsp); if (error) return (error); if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); zfs_unregister_callbacks(zfsvfs); zfsvfs_vfs_free(zfsvfs->z_vfs); vfsp->vfs_data = zfsvfs; zfsvfs->z_vfs = vfsp; if (!issnap) (void) zfs_register_callbacks(vfsp); return (error); } int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) { zfsvfs_t *zfsvfs = sb->s_fs_info; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *ipp = NULL; if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { return (SET_ERROR(EINVAL)); } /* LONG_FID_LEN means snapdirs */ if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { dprintf("snapdir fid: objsetid (%llu) != " "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", objsetid, ZFSCTL_INO_SNAPDIRS, object); return (SET_ERROR(EINVAL)); } if (fid_gen > 1 || setgen != 0) { dprintf("snapdir fid: fid_gen (%llu) and setgen " "(%llu)\n", fid_gen, setgen); return (SET_ERROR(EINVAL)); } return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *ipp = zfsvfs->z_ctldir; ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { /* * Must have an existing ref, so igrab() * cannot return NULL */ VERIFY3P(igrab(*ipp), !=, NULL); } zfs_exit(zfsvfs, FTAG); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { zfs_exit(zfsvfs, FTAG); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if ((fid_gen == 0) && (zfsvfs->z_root == object)) fid_gen = zp_gen; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } *ipp = ZTOI(zp); if (*ipp) zfs_znode_update_vfs(ITOZ(*ipp)); zfs_exit(zfsvfs, FTAG); return (0); } /* * Block out VFS ops and close zfsvfs_t * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err, err2; znode_t *zp; ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; /* * Attempt to re-establish all the active inodes with their * dbufs. If a zfs_rezget() fails, then we unhash the inode * and mark it stale. This prevents a collision if a new * inode/object is created which must use the same inode * number. The stale inode will be be released when the * VFS prunes the dentry holding the remaining references * on the stale inode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { zpl_d_drop_aliases(ZTOI(zp)); remove_inode_hash(ZTOI(zp)); } /* see comment in zfs_suspend_fs() */ if (zp->z_suspended) { zfs_zrele_async(zp); zp->z_suspended = B_FALSE; } } mutex_exit(&zfsvfs->z_znodes_lock); if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) { /* * zfs_suspend_fs() could have interrupted freeing * of dnodes. We need to restart this freeing so * that we don't "leak" the space. */ zfs_unlinked_drain(zfsvfs); } /* * Most of the time zfs_suspend_fs is used for changing the contents * of the underlying dataset. ZFS rollback and receive operations * might create files for which negative dentries are present in * the cache. Since walking the dcache would require a lot of GPL-only * code duplication, it's much easier on these rather rare occasions * just to flush the whole dcache for the given dataset/filesystem. */ shrink_dcache_sb(zfsvfs->z_sb); bail: if (err != 0) zfsvfs->z_unmounted = B_TRUE; /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (zfsvfs->z_os) (void) zfs_umount(zfsvfs->z_sb); } return (err); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_sb); zfsvfs->z_unmounted = B_TRUE; return (0); } /* * Automounted snapshots rely on periodic revalidation * to defer snapshots from being automatically unmounted. */ inline void zfs_exit_fs(zfsvfs_t *zfsvfs) { if (!zfsvfs->z_issnap) return; if (time_after(jiffies, zfsvfs->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zfsvfs->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); } } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_unmounted) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } void zfsvfs_update_fromname(const char *oldname, const char *newname) { /* * We don't need to do anything here, the devname is always current by * virtue of zfsvfs->z_sb->s_op->show_devname. */ (void) oldname, (void) newname; } void zfs_init(void) { zfsctl_init(); zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND register_fo_extend(&zpl_file_operations); #endif } void zfs_fini(void) { /* * we don't use outstanding because zpl_posix_acl_free might add more. */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND unregister_fo_extend(&zpl_file_operations); #endif unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_suspend_fs); EXPORT_SYMBOL(zfs_resume_fs); EXPORT_SYMBOL(zfs_set_version); EXPORT_SYMBOL(zfsvfs_create); EXPORT_SYMBOL(zfsvfs_free); EXPORT_SYMBOL(zfs_is_readonly); EXPORT_SYMBOL(zfs_domount); EXPORT_SYMBOL(zfs_preumount); EXPORT_SYMBOL(zfs_umount); EXPORT_SYMBOL(zfs_remount); EXPORT_SYMBOL(zfs_statvfs); EXPORT_SYMBOL(zfs_vget); EXPORT_SYMBOL(zfs_prune); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 33baac9db06b..b464f615cdd3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4232 +1,4249 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Honor ZFS_APPENDONLY file attribute */ if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_close(struct inode *ip, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) static int zfs_fillpage(struct inode *ip, struct page *pp); /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. Update all mapped * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct address_space *mp = ZTOI(zp)->i_mapping; int64_t off = start & (PAGE_SIZE - 1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t nbytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); void *pb = kmap(pp); int error = dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); } unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the I/O data synchronized * between the DMU cache and the memory mapped pages. Preferentially read * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; int64_t start = uio->uio_loffset; int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t bytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { error = zfs_fillpage(ip, pp); if (error) { unlock_page(pp); put_page(pp); return (error); } } ASSERT(PageUptodate(pp) || PageDirty(pp)); unlock_page(pp); void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); zfs_exit(zfsvfs, FTAG); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, mnt_ns))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ static uint64_t null_xattr = 0; int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; VERIFY_IMPLY(xattr_obj_unlinked, xzp); } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ int +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, + struct kstat *sp) +#else zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) +#endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + zpl_generic_fillattr(user_ns, request_mask, ip, sp); +#else zpl_generic_fillattr(user_ns, ip, sp); +#endif /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, mnt_ns); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; uid_t uid; gid_t gid; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), vap->va_uid); gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, - ZTOI(zp)); + zpl_inode_set_ctime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); zfs_exit(zfsvfs, FTAG); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * rflags - RENAME_* flags * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; /* Needed for whiteout inode creation. */ boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; znode_t *wzp = NULL; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (SET_ERROR(EINVAL)); /* Already checked by Linux VFS, but just to make sure. */ if (rflags & RENAME_EXCHANGE && (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) return (SET_ERROR(EINVAL)); /* * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the * right kind of vattr_t for the whiteout file. These are set * internally by ZFS so should never be incorrect. */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ zfs_exit(zfsvfs, FTAG); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; zfs_exit(zfsvfs, FTAG); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { if (rflags & RENAME_NOREPLACE) { error = SET_ERROR(EEXIST); goto out; } /* * Source and target must be the same type (unless exchanging). */ if (!(rflags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; if (s_is_dir != t_is_dir) { error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } else if (rflags & RENAME_EXCHANGE) { /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ if (rflags & RENAME_WHITEOUT) { /* * Whiteout files are not regular files or directories, so to * match zfs_create() we do not inherit the project id. */ uint64_t wo_projid = ZFS_DEFAULT_PROJID; error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); if (error) goto out; if (!have_acl) { error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, &acl_ids, mnt_ns); if (error) goto out; have_acl = B_TRUE; } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { error = SET_ERROR(EDQUOT); goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } if (rflags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Unlink the source. */ szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) goto commit; /* * Unlink the target. */ if (tzp) { int tzflg = zflg; if (rflags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; tzp->z_pflags |= ZFS_AV_MODIFIED; if (sdzp->z_pflags & ZFS_PROJINHERIT) tzp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); } error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* * Create the new target links: * * We always link the target. * * RENAME_EXCHANGE: Link the old target to the source. * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { /* * If we have removed the existing target, a subsequent call to * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ ASSERT3P(tzp, ==, NULL); goto commit_link_tzp; } switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: error = zfs_link_create(sdl, tzp, tx, ZRENAMING); /* * The same argument as zfs_link_create() failing for * szp applies here, since the source directory must * have had an entry we are replacing. */ ASSERT0(error); if (error) goto commit_unlink_td_szp; break; case RENAME_WHITEOUT: zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); error = zfs_link_create(sdl, wzp, tx, ZNEW); if (error) { zfs_znode_delete(wzp, tx); remove_inode_hash(ZTOI(wzp)); goto commit_unlink_td_szp; } break; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: zfs_log_rename_exchange(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; case RENAME_WHITEOUT: zfs_log_rename_whiteout(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp, wzp); break; default: ASSERT0(rflags & ~RENAME_NOREPLACE); zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; } commit: dmu_tx_commit(tx); out: if (have_acl) zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (wzp) { zfs_znode_update_vfs(wzp); zrele(wzp); } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); /* * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our * best to correct the state. In particular, all of the nlinks are * wrong because we were destroying and creating links with ZRENAMING. * * In some form, all of these operations have to resolve the state: * * * link_destroy() *must* succeed. Fortunately, this is very likely * since we only just created it. * * * link_create()s are allowed to fail (though they shouldn't because * we only just unlinked them and are putting the entries back * during clean-up). But if they fail, we can just forcefully drop * the nlink value to (at the very least) avoid broken nlink values * -- though in the case of non-empty directories we will have to * panic (otherwise we'd have a leaked directory with a broken ..). */ commit_unlink_td_szp: VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); atomic_dec_32(&zp->z_async_writes_cnt); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * for_sync - does the caller intend to wait synchronously for the * page writeback to complete? * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); zfs_exit(zfsvfs, FTAG); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Speed up any non-sync page writebacks since * they may take several seconds to complete. * Refer to the comment in zpl_fsync() (when * HAVE_FSYNC_RANGE is defined) for details. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { zil_commit(zfsvfs->z_log, zp->z_id); } if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } zfs_exit(zfsvfs, FTAG); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; if (!for_sync) atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously * for this page writeback to complete and there are active * synchronous calls on this file, do a commit so that * the latter don't accidentally end up waiting for * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); zfs_exit(zfsvfs, FTAG); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; + inode_timespec_t tmp_ctime; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ctime = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: zfs_exit(zfsvfs, FTAG); return (error); } void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; void *va = kmap(pp); int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); } return (error); } /* * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. * pp - page to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ int zfs_getpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_fillpage(ip, pp); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); return (error); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && + (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } zfs_exit(zfsvfs, FTAG); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +/* CSTYLED */ +module_param(zfs_bclone_enabled, uint, 0644); +MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); + #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index 52c8e51df659..f71026da83cb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -1,2358 +1,2364 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zh->zh_refcount = 0; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); } void zfs_znode_init(void) { /* * Initialize zcache. The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zh->zh_refcount++; ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); mutex_enter(&zh->zh_lock); return (zh); } void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); ASSERT3S(zh->zh_refcount, >, 0); if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND ip->i_fop = &zpl_file_operations.kabi_fops; #else ip->i_fop = &zpl_file_operations; #endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER ip->i_flags |= S_IOPS_WRAPPER; ip->i_op = &zpl_dir_inode_operations.ops; #else ip->i_op = &zpl_dir_inode_operations; #endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. */ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND ip->i_fop = &zpl_file_operations.kabi_fops; #else ip->i_fop = &zpl_file_operations; #endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ #ifdef HAVE_INODE_SET_FLAGS unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); #else if (zp->z_pflags & ZFS_IMMUTABLE) ip->i_flags |= S_IMMUTABLE; else ip->i_flags &= ~S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) ip->i_flags |= S_APPEND; else ip->i_flags &= ~S_APPEND; #endif } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; #endif zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&ip->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. */ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ctime; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. */ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } #if defined(HAVE_INODE_TIMESPEC64_TIMES) #define zfs_compare_timespec timespec64_compare #else #define zfs_compare_timespec timespec_compare #endif /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now; + inode_timespec_t now, tmp_ctime; gethrestime(&now); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); - if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + tmp_ctime = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now; + inode_timespec_t now, tmp_ctime; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + ZFS_TIME_DECODE(&tmp_ctime, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT(error == 0); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } for (;;) { uint64_t pobj = 0; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir = 0; if (prevdb) { ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { strcpy(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_OFF; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); /* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 7786444fea35..8ee7fcecc7b7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -1,664 +1,673 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf */ #include #include #include #include #include #include #include #include /* * Common open routine. Disallow any write access. */ static int zpl_common_open(struct inode *ip, struct file *filp) { if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); } /* * Get root directory contents. */ static int zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (!zpl_dir_emit_dots(filp, ctx)) goto out; if (ctx->pos == 2) { if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) goto out; ctx->pos++; } if (ctx->pos == 3) { if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) goto out; ctx->pos++; } out: zpl_exit(zfsvfs, FTAG); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_root_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ /* * Get root directory attributes. */ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_root_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_root_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->atime = current_time(ip); return (0); } ZPL_GETATTR_WRAPPER(zpl_root_getattr); static struct dentry * zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; int error; crhold(cr); error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } /* * The '.zfs' control directory file and inode operations. */ const struct file_operations zpl_fops_root = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_root_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_root_iterate, #else .readdir = zpl_root_readdir, #endif }; const struct inode_operations zpl_ops_root = { .lookup = zpl_root_lookup, .getattr = zpl_root_getattr, }; static struct vfsmount * zpl_snapdir_automount(struct path *path) { int error; error = -zfsctl_snapshot_mount(path, 0); if (error) return (ERR_PTR(error)); /* * Rather than returning the new vfsmount for the snapshot we must * return NULL to indicate a mount collision. This is done because * the user space mount calls do_add_mount() which adds the vfsmount * to the name space. If we returned the new mount here it would be * added again to the vfsmount list resulting in list corruption. */ return (NULL); } /* * Negative dentries must always be revalidated so newly created snapshots * can be detected and automounted. Normal dentries should be kept because * as of the 3.18 kernel revaliding the mountpoint dentry will result in * the snapshot being immediately unmounted. */ static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i) #else zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) #endif { return (!!dentry->d_inode); } static dentry_operations_t zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() * callback was used as a hack to trigger the mount. The * resulting vfsmount was then explicitly grafted in to the * name space. While it might be possible to add compatibility * code to accomplish this it would require considerable care. */ .d_automount = zpl_snapdir_automount, .d_revalidate = zpl_snapdir_revalidate, }; static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error && error != -ENOENT) return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; return (d_splice_alias(ip, dentry)); } static int zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); fstrans_cookie_t cookie; char snapname[MAXNAMELEN]; boolean_t case_conflict; uint64_t id, pos; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; /* Start the position at 0 if it already emitted . and .. */ pos = (ctx->pos == 2 ? 0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) goto out; if (!zpl_dir_emit(ctx, snapname, strlen(snapname), ZFSCTL_INO_SHARES - id, DT_DIR)) goto out; ctx->pos = pos; } out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); if (error == -ENOENT) return (0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_snapdir_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #elif defined(HAVE_IOPS_RENAME_IDMAP) zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #endif { cred_t *cr = CRED(); int error; /* We probably don't want to support renameat2(2) in ctldir */ if (flags) return (-EINVAL); crhold(cr); error = -zfsctl_snapdir_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } #if (!defined(HAVE_RENAME_WANTS_FLAGS) && \ !defined(HAVE_IOPS_RENAME_USERNS) && \ !defined(HAVE_IOPS_RENAME_IDMAP)) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) { cred_t *cr = CRED(); int error; crhold(cr); error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); #if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); #else zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap); #endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); d_instantiate(dentry, ip); } kmem_free(vap, sizeof (vattr_t)); ASSERT3S(error, <=, 0); crfree(cr); return (error); } /* * Get snapshot directory attributes. */ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_snapdir_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->nlink = stat->size = 2; dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { uint64_t snap_count; int err = zap_count( dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { zpl_exit(zfsvfs, FTAG); return (-err); } stat->nlink += snap_count; } stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); /* * The '.zfs/snapshot' directory file operations. These mainly control * generating the list of available snapshots when doing an 'ls' in the * directory. See zpl_snapdir_readdir(). */ const struct file_operations zpl_fops_snapdir = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_snapdir_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_snapdir_iterate, #else .readdir = zpl_snapdir_readdir, #endif }; /* * The '.zfs/snapshot' directory inode operations. These mainly control * creating an inode for a snapshot directory and initializing the needed * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). */ const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, #if (defined(HAVE_RENAME_WANTS_FLAGS) || \ defined(HAVE_IOPS_RENAME_USERNS) || \ defined(HAVE_IOPS_RENAME_IDMAP)) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, #endif .rmdir = zpl_snapdir_rmdir, .mkdir = zpl_snapdir_mkdir, }; static struct dentry * zpl_shares_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } static int zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) { fstrans_cookie_t cookie; cred_t *cr = CRED(); zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); znode_t *dzp; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { zpl_dir_emit_dots(filp, ctx); goto out; } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error) goto out; crhold(cr); error = -zfs_readdir(ZTOI(dzp), ctx, cr); crfree(cr); iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_shares_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int #ifdef HAVE_USERNS_IOPS_GETATTR zpl_shares_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_IDMAP_IOPS_GETATTR) zpl_shares_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *dzp; int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsvfs->z_shares_dir == 0) { #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(path->dentry->d_inode, stat); #endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp), + stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); #endif iput(ZTOI(dzp)); } zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_shares_getattr); /* * The '.zfs/shares' directory file operations. */ const struct file_operations zpl_fops_shares = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_shares_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_shares_iterate, #else .readdir = zpl_shares_readdir, #endif }; /* * The '.zfs/shares' directory inode operations. */ const struct inode_operations zpl_ops_shares = { .lookup = zpl_shares_lookup, .getattr = zpl_shares_getattr, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c index c47fe99dacff..139c51cf46df 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c @@ -1,276 +1,281 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2023, Klara Inc. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include +int zfs_bclone_enabled = 0; + /* * Clone part of a file via block cloning. * * Note that we are not required to update file offsets; the kernel will take * care of that depending on how it was called. */ static ssize_t __zpl_clone_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); struct inode *dst_i = file_inode(dst_file); uint64_t src_off_o = (uint64_t)src_off; uint64_t dst_off_o = (uint64_t)dst_off; uint64_t len_o = (uint64_t)len; cred_t *cr = CRED(); fstrans_cookie_t cookie; int err; + if (!zfs_bclone_enabled) + return (-EOPNOTSUPP); + if (!spa_feature_is_enabled( dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) return (-EOPNOTSUPP); if (src_i != dst_i) spl_inode_lock_shared(src_i); spl_inode_lock(dst_i); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i), &dst_off_o, &len_o, cr); spl_fstrans_unmark(cookie); crfree(cr); spl_inode_unlock(dst_i); if (src_i != dst_i) spl_inode_unlock_shared(src_i); if (err < 0) return (err); return ((ssize_t)len_o); } #if defined(HAVE_VFS_COPY_FILE_RANGE) || \ defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) /* * Entry point for copy_file_range(). Copy len bytes from src_off in src_file * to dst_off in dst_file. We are permitted to do this however we like, so we * try to just clone the blocks, and if we can't support it, fall back to the * kernel's generic byte copy function. */ ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) { ssize_t ret; if (flags != 0) return (-EINVAL); /* Try to do it via zfs_clone_range() */ ret = __zpl_clone_file_range(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE /* * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. */ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); #else /* * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal * to the kernel that it should fallback to a content copy. */ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ return (ret); } #endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ #ifdef HAVE_VFS_REMAP_FILE_RANGE /* * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE. * * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except * that they must clone - they cannot fall back to copying. FICLONE is exactly * FICLONERANGE, for the entire file. We don't need to try to tell them apart; * the kernel will sort that out for us. * * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags) { if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); /* * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given * range if we want. Its designed for filesystems that make data past * EOF available, and don't want it to be visible in both files. ZFS * doesn't do that, so we just turn the flag off. */ flags &= ~REMAP_FILE_CAN_SHORTEN; if (flags & REMAP_FILE_DEDUP) /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; return (__zpl_clone_file_range(src_file, src_off, dst_file, dst_off, len)); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ #if defined(HAVE_VFS_CLONE_FILE_RANGE) || \ defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) /* * Entry point for FICLONE and FICLONERANGE, before Linux 4.20. */ int zpl_clone_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len) { /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; return (__zpl_clone_file_range(src_file, src_off, dst_file, dst_off, len)); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ #ifdef HAVE_VFS_DEDUPE_FILE_RANGE /* * Entry point for FIDEDUPERANGE, before Linux 4.20. */ int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len) { /* No support for dedup yet */ return (-EOPNOTSUPP); } #endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ /* Entry point for FICLONE, before Linux 4.5. */ long zpl_ioctl_ficlone(struct file *dst_file, void *arg) { unsigned long sfd = (unsigned long)arg; struct file *src_file = fget(sfd); if (src_file == NULL) return (-EBADF); if (dst_file->f_op != src_file->f_op) { fput(src_file); return (-EXDEV); } size_t len = i_size_read(file_inode(src_file)); ssize_t ret = __zpl_clone_file_range(src_file, 0, dst_file, 0, len); fput(src_file); if (ret < 0) { if (ret == -EOPNOTSUPP) return (-ENOTTY); return (ret); } if (ret != len) return (-EINVAL); return (0); } /* Entry point for FICLONERANGE, before Linux 4.5. */ long zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) { zfs_ioc_compat_file_clone_range_t fcr; if (copy_from_user(&fcr, arg, sizeof (fcr))) return (-EFAULT); struct file *src_file = fget(fcr.fcr_src_fd); if (src_file == NULL) return (-EBADF); if (dst_file->f_op != src_file->f_op) { fput(src_file); return (-EXDEV); } size_t len = fcr.fcr_src_length; if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); if (ret < 0) { if (ret == -EOPNOTSUPP) return (-ENOTTY); return (ret); } if (ret != len) return (-EINVAL); return (0); } /* Entry point for FIDEDUPERANGE, before Linux 4.5. */ long zpl_ioctl_fideduperange(struct file *filp, void *arg) { (void) arg; /* No support for dedup yet */ return (-ENOTTY); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index 5f5ad186a61c..96f65b9e94e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -1,909 +1,911 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #include #include #include #include #include #include #include #include #include static struct dentry * zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; znode_t *zp; int error; fstrans_cookie_t cookie; pathname_t *ppn = NULL; pathname_t pn; int zfs_flags = 0; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; if (dlen(dentry) >= ZAP_MAXNAMELEN) return (ERR_PTR(-ENAMETOOLONG)); crhold(cr); cookie = spl_fstrans_mark(); /* If we are a case insensitive fs, we need the real name */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { zfs_flags = FIGNORECASE; pn_alloc(&pn); ppn = &pn; } error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp, zfs_flags, cr, NULL, ppn); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); crfree(cr); spin_lock(&dentry->d_lock); dentry->d_time = jiffies; spin_unlock(&dentry->d_lock); if (error) { /* * If we have a case sensitive fs, we do not want to * insert negative entries, so return NULL for ENOENT. * Fall through if the error is not ENOENT. Also free memory. */ if (ppn) { pn_free(ppn); if (error == -ENOENT) return (NULL); } if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } ip = ZTOI(zp); /* * If we are case insensitive, call the correct function * to install the name. */ if (ppn) { struct dentry *new_dentry; struct qstr ci_name; if (strcmp(dname(dentry), pn.pn_buf) == 0) { new_dentry = d_splice_alias(ip, dentry); } else { ci_name.name = pn.pn_buf; ci_name.len = strlen(pn.pn_buf); new_dentry = d_add_ci(dentry, ip, &ci_name); } pn_free(ppn); return (new_dentry); } else { return (d_splice_alias(ip, dentry)); } } void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, zidmap_t *mnt_ns) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; vap->va_uid = zfs_vfsuid_to_uid(mnt_ns, zfs_i_user_ns(dir), crgetuid(cr)); if (dir->i_mode & S_ISGID) { vap->va_gid = KGID_TO_SGID(dir->i_gid); if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { vap->va_gid = zfs_vfsgid_to_gid(mnt_ns, zfs_i_user_ns(dir), crgetgid(cr)); } } static int #ifdef HAVE_IOPS_CREATE_USERNS zpl_create(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #elif defined(HAVE_IOPS_CREATE_IDMAP) zpl_create(struct mnt_idmap *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #else zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #endif { cred_t *cr = CRED(); znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_IOPS_CREATE_USERNS) || defined(HAVE_IOPS_CREATE_IDMAP)) zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) { (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); remove_inode_hash(ZTOI(zp)); iput(ZTOI(zp)); } else { d_instantiate(dentry, ZTOI(zp)); } } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_IOPS_MKNOD_USERNS zpl_mknod(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, #elif defined(HAVE_IOPS_MKNOD_IDMAP) zpl_mknod(struct mnt_idmap *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, #else zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, #endif dev_t rdev) { cred_t *cr = CRED(); znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_IOPS_MKNOD_USERNS) || defined(HAVE_IOPS_MKNOD_IDMAP)) zidmap_t *user_ns = kcred->user_ns; #endif /* * We currently expect Linux to supply rdev=0 for all sockets * and fifos, but we want to know if this behavior ever changes. */ if (S_ISSOCK(mode) || S_ISFIFO(mode)) ASSERT(rdev == 0); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr, user_ns); vap->va_rdev = rdev; cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) { (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); remove_inode_hash(ZTOI(zp)); iput(ZTOI(zp)); } else { d_instantiate(dentry, ZTOI(zp)); } } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_TMPFILE static int #ifdef HAVE_TMPFILE_IDMAP zpl_tmpfile(struct mnt_idmap *userns, struct inode *dir, struct file *file, umode_t mode) #elif !defined(HAVE_TMPFILE_DENTRY) zpl_tmpfile(struct user_namespace *userns, struct inode *dir, struct file *file, umode_t mode) #else #ifdef HAVE_TMPFILE_USERNS zpl_tmpfile(struct user_namespace *userns, struct inode *dir, struct dentry *dentry, umode_t mode) #else zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) #endif #endif { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_TMPFILE_USERNS) || defined(HAVE_TMPFILE_IDMAP)) zidmap_t *userns = kcred->user_ns; #endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); /* * The VFS does not apply the umask, therefore it is applied here * when POSIX ACLs are not enabled. */ if (!IS_POSIXACL(dir)) mode &= ~current_umask(); zpl_vap_init(vap, dir, mode, cr, userns); cookie = spl_fstrans_mark(); error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL, userns); if (error == 0) { /* d_tmpfile will do drop_nlink, so we should set it first */ set_nlink(ip, 1); #ifndef HAVE_TMPFILE_DENTRY d_tmpfile(file, ip); error = zpl_xattr_security_init(ip, dir, &file->f_path.dentry->d_name); #else d_tmpfile(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); #endif if (error == 0) error = zpl_init_acl(ip, dir); #ifndef HAVE_TMPFILE_DENTRY error = finish_open_simple(file, error); #endif /* * don't need to handle error here, file is already in * unlinked set. */ } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #endif static int zpl_unlink(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) #else zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) #endif { cred_t *cr = CRED(); vattr_t *vap; znode_t *zp; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode | S_IFDIR, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) { (void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); remove_inode_hash(ZTOI(zp)); iput(ZTOI(zp)); } else { d_instantiate(dentry, ZTOI(zp)); } } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_rmdir(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_USERNS_IOPS_GETATTR zpl_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_IDMAP_IOPS_GETATTR) zpl_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { int error; fstrans_cookie_t cookie; struct inode *ip = path->dentry->d_inode; znode_t *zp __maybe_unused = ITOZ(ip); cookie = spl_fstrans_mark(); /* * XXX query_flags currently ignored. */ -#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ip, stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ip, stat); #else error = -zfs_getattr_fast(kcred->user_ns, ip, stat); #endif #ifdef STATX_BTIME if (request_mask & STATX_BTIME) { stat->btime = zp->z_btime; stat->result_mask |= STATX_BTIME; } #endif #ifdef STATX_ATTR_IMMUTABLE if (zp->z_pflags & ZFS_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; #endif #ifdef STATX_ATTR_APPEND if (zp->z_pflags & ZFS_APPENDONLY) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; #endif #ifdef STATX_ATTR_NODUMP if (zp->z_pflags & ZFS_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; #endif spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_getattr); static int #ifdef HAVE_USERNS_IOPS_SETATTR zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, struct iattr *ia) #elif defined(HAVE_IDMAP_IOPS_SETATTR) zpl_setattr(struct mnt_idmap *user_ns, struct dentry *dentry, struct iattr *ia) #else zpl_setattr(struct dentry *dentry, struct iattr *ia) #endif { struct inode *ip = dentry->d_inode; cred_t *cr = CRED(); vattr_t *vap; int error; fstrans_cookie_t cookie; #ifdef HAVE_SETATTR_PREPARE_USERNS error = zpl_setattr_prepare(user_ns, dentry, ia); #elif defined(HAVE_SETATTR_PREPARE_IDMAP) error = zpl_setattr_prepare(user_ns, dentry, ia); #else error = zpl_setattr_prepare(zfs_init_idmap, dentry, ia); #endif if (error) return (error); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; vap->va_mode = ia->ia_mode; if (ia->ia_valid & ATTR_UID) #ifdef HAVE_IATTR_VFSID vap->va_uid = zfs_vfsuid_to_uid(user_ns, zfs_i_user_ns(ip), __vfsuid_val(ia->ia_vfsuid)); #else vap->va_uid = KUID_TO_SUID(ia->ia_uid); #endif if (ia->ia_valid & ATTR_GID) #ifdef HAVE_IATTR_VFSID vap->va_gid = zfs_vfsgid_to_gid(user_ns, zfs_i_user_ns(ip), __vfsgid_val(ia->ia_vfsgid)); #else vap->va_gid = KGID_TO_SGID(ia->ia_gid); #endif vap->va_size = ia->ia_size; vap->va_atime = ia->ia_atime; vap->va_mtime = ia->ia_mtime; vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); cookie = spl_fstrans_mark(); #ifdef HAVE_USERNS_IOPS_SETATTR error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); #elif defined(HAVE_IDMAP_IOPS_SETATTR) error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); #else error = -zfs_setattr(ITOZ(ip), vap, 0, cr, zfs_init_idmap); #endif if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #elif defined(HAVE_IOPS_RENAME_IDMAP) zpl_rename2(struct mnt_idmap *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #else zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #endif { cred_t *cr = CRED(); vattr_t *wo_vap = NULL; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_IOPS_RENAME_USERNS) || defined(HAVE_IOPS_RENAME_IDMAP)) zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); if (rflags & RENAME_WHITEOUT) { wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns); wo_vap->va_rdev = makedevice(0, 0); } cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), dname(tdentry), cr, 0, rflags, wo_vap, user_ns); spl_fstrans_unmark(cookie); if (wo_vap) kmem_free(wo_vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_IOPS_RENAME_USERNS) && \ !defined(HAVE_RENAME_WANTS_FLAGS) && \ !defined(HAVE_RENAME2) && \ !defined(HAVE_IOPS_RENAME_IDMAP) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int #ifdef HAVE_IOPS_SYMLINK_USERNS zpl_symlink(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, const char *name) #elif defined(HAVE_IOPS_SYMLINK_IDMAP) zpl_symlink(struct mnt_idmap *user_ns, struct inode *dir, struct dentry *dentry, const char *name) #else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) #endif { cred_t *cr = CRED(); vattr_t *vap; znode_t *zp; int error; fstrans_cookie_t cookie; #if !(defined(HAVE_IOPS_SYMLINK_USERNS) || defined(HAVE_IOPS_SYMLINK_IDMAP)) zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, (char *)name, &zp, cr, 0, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error) { (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); remove_inode_hash(ZTOI(zp)); iput(ZTOI(zp)); } else { d_instantiate(dentry, ZTOI(zp)); } } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if defined(HAVE_PUT_LINK_COOKIE) static void zpl_put_link(struct inode *unused, void *cookie) { kmem_free(cookie, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_NAMEIDATA) static void zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { const char *link = nd_get_link(nd); if (!IS_ERR(link)) kmem_free(link, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_DELAYED) static void zpl_put_link(void *ptr) { kmem_free(ptr, MAXPATHLEN); } #endif static int zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); int error; crhold(cr); *link = NULL; struct iovec iov; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); spl_fstrans_unmark(cookie); crfree(cr); if (error) kmem_free(iov.iov_base, MAXPATHLEN); else *link = iov.iov_base; return (error); } #if defined(HAVE_GET_LINK_DELAYED) static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); set_delayed_call(done, zpl_put_link, link); return (link); } #elif defined(HAVE_GET_LINK_COOKIE) static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_COOKIE) static const char * zpl_follow_link(struct dentry *dentry, void **cookie) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) static void * zpl_follow_link(struct dentry *dentry, struct nameidata *nd) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) nd_set_link(nd, ERR_PTR(error)); else nd_set_link(nd, link); return (NULL); } #endif static int zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); struct inode *ip = old_dentry->d_inode; int error; fstrans_cookie_t cookie; if (ip->i_nlink >= ZFS_LINK_MAX) return (-EMLINK); crhold(cr); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ip), !=, NULL); cookie = spl_fstrans_mark(); error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0); if (error) { iput(ip); goto out; } d_instantiate(dentry, ip); out: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ #if defined(HAVE_GET_INODE_ACL) .get_inode_acl = zpl_get_acl, #else .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ }; #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER const struct inode_operations_wrapper zpl_dir_inode_operations = { .ops = { #else const struct inode_operations zpl_dir_inode_operations = { #endif .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, .unlink = zpl_unlink, .symlink = zpl_symlink, .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, #ifdef HAVE_RENAME2 .rename2 = zpl_rename2, #elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #elif defined(HAVE_IOPS_RENAME_IDMAP) .rename = zpl_rename2, #else .rename = zpl_rename, #endif #ifdef HAVE_TMPFILE .tmpfile = zpl_tmpfile, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ #if defined(HAVE_GET_INODE_ACL) .get_inode_acl = zpl_get_acl, #else .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER }, .rename2 = zpl_rename2, #endif }; const struct inode_operations zpl_symlink_inode_operations = { #ifdef HAVE_GENERIC_READLINK .readlink = generic_readlink, #endif #if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) .get_link = zpl_get_link, #elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) .follow_link = zpl_follow_link, #endif #if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) .put_link = zpl_put_link, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, }; const struct inode_operations zpl_special_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ #if defined(HAVE_GET_INODE_ACL) .get_inode_acl = zpl_get_acl, #else .get_acl = zpl_get_acl, #endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index ad52a11aada0..d98d32c1f9fb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -1,411 +1,411 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. */ #include #include #include #include #include static struct inode * zpl_inode_alloc(struct super_block *sb) { struct inode *ip; VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); inode_set_iversion(ip, 1); return (ip); } static void zpl_inode_destroy(struct inode *ip) { ASSERT(atomic_read(&ip->i_count) == 0); zfs_inode_destroy(ip); } /* * Called from __mark_inode_dirty() to reflect that something in the * inode has changed. We use it to ensure the znode system attributes * are always strictly update to date with respect to the inode. */ #ifdef HAVE_DIRTY_INODE_WITH_FLAGS static void zpl_dirty_inode(struct inode *ip, int flags) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, flags); spl_fstrans_unmark(cookie); } #else static void zpl_dirty_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); zfs_dirty_inode(ip, 0); spl_fstrans_unmark(cookie); } #endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ /* * When ->drop_inode() is called its return value indicates if the * inode should be evicted from the inode cache. If the inode is * unhashed and has no links the default policy is to evict it * immediately. * * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the * truncate in evict(). For earlier kernels clear_inode() maps to * end_writeback() which is responsible for completing all outstanding * write back. In either case, once this is done it is safe to cleanup * any remaining inode specific data via zfs_inactive(). * remaining filesystem specific data. */ static void zpl_evict_inode(struct inode *ip) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); truncate_setsize(ip, 0); clear_inode(ip); zfs_inactive(ip); spl_fstrans_unmark(cookie); } static void zpl_put_super(struct super_block *sb) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_umount(sb); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); } static int zpl_sync_fs(struct super_block *sb, int wait) { fstrans_cookie_t cookie; cred_t *cr = CRED(); int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_sync(sb, wait, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_statfs(struct dentry *dentry, struct kstatfs *statp) { fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_statvfs(dentry->d_inode, statp); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); /* * If required by a 32-bit system call, dynamically scale the * block size up to 16MiB and decrease the block counts. This * allows for a maximum size of 64EiB to be reported. The file * counts must be artificially capped at 2^32-1. */ if (unlikely(zpl_is_32bit_api())) { while (statp->f_blocks > UINT32_MAX && statp->f_bsize < SPA_MAXBLOCKSIZE) { statp->f_frsize <<= 1; statp->f_bsize <<= 1; statp->f_blocks >>= 1; statp->f_bfree >>= 1; statp->f_bavail >>= 1; } uint64_t usedobjs = statp->f_files - statp->f_ffree; statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs); statp->f_files = statp->f_ffree + usedobjs; } return (error); } static int zpl_remount_fs(struct super_block *sb, int *flags, char *data) { zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_remount(sb, flags, &zm); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); for (int i = 0; fsname[i] != 0; i++) { /* * Spaces in the dataset name must be converted to their * octal escape sequence for getmntent(3) to correctly * parse then fsname portion of /proc/self/mounts. */ if (fsname[i] == ' ') { seq_puts(seq, "\\040"); } else { seq_putc(seq, fsname[i]); } } kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); zpl_exit(zfsvfs, FTAG); return (0); } static int zpl_show_devname(struct seq_file *seq, struct dentry *root) { return (__zpl_show_devname(seq, root->d_sb->s_fs_info)); } static int __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) { seq_printf(seq, ",%s", zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); #ifdef CONFIG_FS_POSIX_ACL switch (zfsvfs->z_acl_type) { case ZFS_ACLTYPE_POSIX: seq_puts(seq, ",posixacl"); break; default: seq_puts(seq, ",noacl"); break; } #endif /* CONFIG_FS_POSIX_ACL */ switch (zfsvfs->z_case) { case ZFS_CASE_SENSITIVE: seq_puts(seq, ",casesensitive"); break; case ZFS_CASE_INSENSITIVE: seq_puts(seq, ",caseinsensitive"); break; default: seq_puts(seq, ",casemixed"); break; } return (0); } static int zpl_show_options(struct seq_file *seq, struct dentry *root) { return (__zpl_show_options(seq, root->d_sb->s_fs_info)); } static int zpl_fill_super(struct super_block *sb, void *data, int silent) { zfs_mnt_t *zm = (zfs_mnt_t *)data; fstrans_cookie_t cookie; int error; cookie = spl_fstrans_mark(); error = -zfs_domount(sb, zm, silent); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } static int zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; /* * If the os doesn't match the z_os in the super_block, assume it is * not a match. Matching would imply a multimount of a dataset. It is * possible that during a multimount, there is a simultaneous operation * that changes the z_os, e.g., rollback, where the match will be * missed, but in that case the user will get an EBUSY. */ return (zfsvfs != NULL && os == zfsvfs->z_os); } static struct super_block * zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) { struct super_block *s; objset_t *os; int err; err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); if (err) return (ERR_PTR(-err)); /* * The dsl pool lock must be released prior to calling sget(). * It is possible sget() may block on the lock in grab_super() * while deactivate_super() holds that same lock and waits for * a txg sync. If the dsl_pool lock is held over sget() * this can prevent the pool sync and cause a deadlock. */ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); dsl_pool_rele(dmu_objset_pool(os), FTAG); s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); /* * Recheck with the lock held to prevent mounting the wrong dataset * since z_os can be stale when the teardown lock is held. * * We can't do this in zpl_test_super in since it's under spinlock and * also s_umount lock is not held there so it would race with * zfs_umount and zfsvfs can be freed. */ if (!IS_ERR(s) && s->s_fs_info != NULL) { zfsvfs_t *zfsvfs = s->s_fs_info; if (zpl_enter(zfsvfs, FTAG) == 0) { if (os != zfsvfs->z_os) err = -SET_ERROR(EBUSY); zpl_exit(zfsvfs, FTAG); } else { err = -SET_ERROR(EBUSY); } } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); if (err) { deactivate_locked_super(s); return (ERR_PTR(err)); } if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { deactivate_locked_super(s); return (ERR_PTR(err)); } s->s_flags |= SB_ACTIVE; } else if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return (ERR_PTR(-EBUSY)); } return (s); } static struct dentry * zpl_mount(struct file_system_type *fs_type, int flags, const char *osname, void *data) { zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); if (IS_ERR(sb)) return (ERR_CAST(sb)); return (dget(sb->s_root)); } static void zpl_kill_sb(struct super_block *sb) { zfs_preumount(sb); kill_anon_super(sb); } void -zpl_prune_sb(int64_t nr_to_scan, void *arg) +zpl_prune_sb(uint64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; (void) -zfs_prune(sb, nr_to_scan, &objects); } const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, .remount_fs = zpl_remount_fs, .show_devname = zpl_show_devname, .show_options = zpl_show_options, .show_stats = NULL, }; struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, #if defined(HAVE_IDMAP_MNT_API) .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, #else .fs_flags = FS_USERNS_MOUNT, #endif .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 96d85991811e..4e4f5210f85d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -1,1637 +1,1638 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * * Extended attributes (xattr) on Solaris are implemented as files * which exist in a hidden xattr directory. These extended attributes * can be accessed using the attropen() system call which opens * the extended attribute. It can then be manipulated just like * a standard file descriptor. This has a couple advantages such * as practically no size limit on the file, and the extended * attributes permissions may differ from those of the parent file. * This interface is really quite clever, but it's also completely * different than what is supported on Linux. It also comes with a * steep performance penalty when accessing small xattrs because they * are not stored with the parent file. * * Under Linux extended attributes are manipulated by the system * calls getxattr(2), setxattr(2), and listxattr(2). They consider * extended attributes to be name/value pairs where the name is a * NULL terminated string. The name must also include one of the * following namespace prefixes: * * user - No restrictions and is available to user applications. * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. * system - Used for access control lists (system.nfs4_acl, etc). * security - Used by SELinux to store a files security context. * * The value under Linux to limited to 65536 bytes of binary data. * In practice, individual xattrs tend to be much smaller than this * and are typically less than 100 bytes. A good example of this * are the security.selinux xattrs which are less than 100 bytes and * exist for every file when xattr labeling is enabled. * * The Linux xattr implementation has been written to take advantage of * this typical usage. When the dataset property 'xattr=sa' is set, * then xattrs will be preferentially stored as System Attributes (SA). * This allows tiny xattrs (~100 bytes) to be stored with the dnode and * up to 64k of xattrs to be stored in the spill block. If additional * xattr space is required, which is unlikely under Linux, they will * be stored using the traditional directory approach. * * This optimization results in roughly a 3x performance improvement * when accessing xattrs because it avoids the need to perform a seek * for every xattr value. When multiple xattrs are stored per-file * the performance improvements are even greater because all of the * xattrs stored in the spill block will be cached. * * However, by default SA based xattrs are disabled in the Linux port * to maximize compatibility with other implementations. If you do * enable SA based xattrs then they will not be visible on platforms * which do not support this feature. * * NOTE: One additional consequence of the xattr directory implementation * is that when an extended attribute is manipulated an inode is created. * This inode will exist in the Linux inode cache but there will be no * associated entry in the dentry cache which references it. This is * safe but it may result in some confusion. Enabling SA based xattrs * largely avoids the issue except in the overflow case. */ #include #include #include #include #include #include #include enum xattr_permission { XAPERM_DENY, XAPERM_ALLOW, XAPERM_COMPAT, }; typedef struct xattr_filldir { size_t size; size_t offset; char *buf; struct dentry *dentry; } xattr_filldir_t; static enum xattr_permission zpl_xattr_permission(xattr_filldir_t *, const char *, int); static int zfs_xattr_compat = 0; /* * Determine is a given xattr name should be visible and if so copy it * in to the provided buffer (xf->buf). */ static int zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) { enum xattr_permission perm; /* Check permissions using the per-namespace list xattr handler. */ perm = zpl_xattr_permission(xf, name, name_len); if (perm == XAPERM_DENY) return (0); /* Prefix the name with "user." if it does not have a namespace. */ if (perm == XAPERM_COMPAT) { if (xf->buf) { if (xf->offset + XATTR_USER_PREFIX_LEN + 1 > xf->size) return (-ERANGE); memcpy(xf->buf + xf->offset, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); xf->buf[xf->offset + XATTR_USER_PREFIX_LEN] = '\0'; } xf->offset += XATTR_USER_PREFIX_LEN; } /* When xf->buf is NULL only calculate the required size. */ if (xf->buf) { if (xf->offset + name_len + 1 > xf->size) return (-ERANGE); memcpy(xf->buf + xf->offset, name, name_len); xf->buf[xf->offset + name_len] = '\0'; } xf->offset += (name_len + 1); return (0); } /* * Read as many directory entry names as will fit in to the provided buffer, * or when no buffer is provided calculate the required buffer size. */ static int zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) { zap_cursor_t zc; zap_attribute_t zap; int error; zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { error = -ENXIO; break; } error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); if (error) break; zap_cursor_advance(&zc); } zap_cursor_fini(&zc); if (error == -ENOENT) error = 0; return (error); } static ssize_t zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) { struct inode *ip = xf->dentry->d_inode; struct inode *dxip = NULL; znode_t *dxzp; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) { if (error == -ENOENT) error = 0; return (error); } dxip = ZTOI(dxzp); error = zpl_xattr_readdir(dxip, xf); iput(dxip); return (error); } static ssize_t zpl_xattr_list_sa(xattr_filldir_t *xf) { znode_t *zp = ITOZ(xf->dentry->d_inode); nvpair_t *nvp = NULL; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); error = zpl_xattr_filldir(xf, nvpair_name(nvp), strlen(nvpair_name(nvp))); if (error) return (error); } return (0); } ssize_t zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { znode_t *zp = ITOZ(dentry->d_inode); zfsvfs_t *zfsvfs = ZTOZSB(zp); xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; cred_t *cr = CRED(); fstrans_cookie_t cookie; int error = 0; crhold(cr); cookie = spl_fstrans_mark(); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) goto out1; rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_list_sa(&xf); if (error) goto out; } error = zpl_xattr_list_dir(&xf, cr); if (error) goto out; error = xf.offset; out: rw_exit(&zp->z_xattr_lock); zpl_exit(zfsvfs, FTAG); out1: spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { fstrans_cookie_t cookie; struct inode *xip = NULL; znode_t *dxzp = NULL; znode_t *xzp = NULL; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error) goto out; xip = ZTOI(xzp); if (!size) { error = i_size_read(xip); goto out; } if (size < i_size_read(xip)) { error = -ERANGE; goto out; } struct iovec iov; iov.iov_base = (void *)value; iov.iov_len = size; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); cookie = spl_fstrans_mark(); error = -zfs_read(ITOZ(xip), &uio, 0, cr); spl_fstrans_unmark(cookie); if (error == 0) error = size - zfs_uio_resid(&uio); out: if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); return (error); } static int zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); uchar_t *nv_value; uint_t nv_size; int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, &nv_value, &nv_size); if (error) return (error); if (size == 0 || value == NULL) return (nv_size); if (size < nv_size) return (-ERANGE); memcpy(value, nv_value, nv_size); return (nv_size); } static int __zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, value, size); if (error != -ENOENT) goto out; } error = zpl_xattr_get_dir(ip, name, value, size, cr); out: if (error == -ENOENT) error = -ENODATA; return (error); } #define XATTR_NOENT 0x0 #define XATTR_IN_SA 0x1 #define XATTR_IN_DIR 0x2 /* check where the xattr resides */ static int __zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(where); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); *where = XATTR_NOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, NULL, 0); if (error >= 0) *where |= XATTR_IN_SA; else if (error != -ENOENT) return (error); } error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); if (error >= 0) *where |= XATTR_IN_DIR; else if (error != -ENOENT) return (error); if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" " in both SA and dir", ip, name); if (*where == XATTR_NOENT) error = -ENODATA; else error = 0; return (error); } static int zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int error; crhold(cr); cookie = spl_fstrans_mark(); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) goto out; rw_enter(&zp->z_xattr_lock, RW_READER); error = __zpl_xattr_get(ip, name, value, size, cr); rw_exit(&zp->z_xattr_lock); zpl_exit(zfsvfs, FTAG); out: spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *dxzp = NULL; znode_t *xzp = NULL; vattr_t *vap = NULL; int lookup_flags, error; const int xattr_mode = S_IFREG | 0644; loff_t pos = 0; /* * Lookup the xattr directory. When we're adding an entry pass * CREATE_XATTR_DIR to ensure the xattr directory is created. * When removing an entry this flag is not passed to avoid * unnecessarily creating a new xattr directory. */ lookup_flags = LOOKUP_XATTR; if (value != NULL) lookup_flags |= CREATE_XATTR_DIR; error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error && (error != -ENOENT)) goto out; error = 0; /* Remove a specific name xattr when value is set to NULL. */ if (value == NULL) { if (xzp) error = -zfs_remove(dxzp, (char *)name, cr, 0); goto out; } /* Lookup failed create a new xattr. */ if (xzp == NULL) { vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mode = xattr_mode; vap->va_mask = ATTR_MODE; vap->va_uid = crgetuid(cr); vap->va_gid = crgetgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, cr, ATTR_NOACLCHECK, NULL, zfs_init_idmap); if (error) goto out; } ASSERT(xzp != NULL); error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE); if (error) goto out; error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); } if (vap) kmem_free(vap, sizeof (vattr_t)); if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); if (error == -ENOENT) error = -ENODATA; ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *zp = ITOZ(ip); nvlist_t *nvl; size_t sa_size; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); nvl = zp->z_xattr_cached; if (value == NULL) { error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); if (error == -ENOENT) error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); } else { /* Limited to 32k to keep nvpair memory allocations small */ if (size > DXATTR_MAX_ENTRY_SIZE) return (-EFBIG); /* Prevent the DXATTR SA from consuming the entire SA region */ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error) return (error); if (sa_size > DXATTR_MAX_SA_SIZE) return (-EFBIG); error = -nvlist_add_byte_array(nvl, name, (uchar_t *)value, size); } /* * Update the SA for additions, modifications, and removals. On * error drop the inconsistent cached version of the nvlist, it * will be reconstructed from the ARC when next accessed. */ if (error == 0) error = -zfs_sa_set_xattr(zp, name, value, size); if (error) { nvlist_free(nvl); zp->z_xattr_cached = NULL; } ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int where; int error; crhold(cr); cookie = spl_fstrans_mark(); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) goto out1; rw_enter(&zp->z_xattr_lock, RW_WRITER); /* * Before setting the xattr check to see if it already exists. * This is done to ensure the following optional flags are honored. * * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist * * We also want to know if it resides in sa or dir, so we can make * sure we don't end up with duplicate in both places. */ error = __zpl_xattr_where(ip, name, &where, cr); if (error < 0) { if (error != -ENODATA) goto out; if (flags & XATTR_REPLACE) goto out; /* The xattr to be removed already doesn't exist */ error = 0; if (value == NULL) goto out; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto out; } /* Preferentially store the xattr as a SA for better performance */ if (zfsvfs->z_use_sa && zp->z_is_sa && (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir. */ if (where & XATTR_IN_DIR) zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); goto out; } } error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); /* * Successfully put into dir, we need to clear the one in SA. */ if (error == 0 && (where & XATTR_IN_SA)) zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&zp->z_xattr_lock); zpl_exit(zfsvfs, FTAG); out1: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } /* * Extended user attributes * * "Extended user attributes may be assigned to files and directories for * storing arbitrary additional information such as the mime type, * character set or encoding of a file. The access permissions for user * attributes are defined by the file permission bits: read permission * is required to retrieve the attribute value, and writer permission is * required to change it. * * The file permission bits of regular files and directories are * interpreted differently from the file permission bits of special * files and symbolic links. For regular files and directories the file * permission bits define access to the file's contents, while for * device special files they define access to the device described by * the special file. The file permissions of symbolic links are not * used in access checks. These differences would allow users to * consume filesystem resources in a way not controllable by disk quotas * for group or world writable special files and directories. * * For this reason, extended user attributes are allowed only for * regular files and directories, and access to extended user attributes * is restricted to the owner and to users with appropriate capabilities * for directories with the sticky bit set (see the chmod(1) manual page * for an explanation of the sticky bit)." - xattr(7) * * ZFS allows extended user attributes to be disabled administratively * by setting the 'xattr=off' property on the dataset. */ static int __zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (ITOZSB(ip)->z_flags & ZSB_XATTR); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); static int __zpl_xattr_user_get(struct inode *ip, const char *name, void *value, size_t size) { int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (ZFS_XA_NS_PREFIX_FORBIDDEN(name)) return (-EINVAL); if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); /* * Try to look up the name with the namespace prefix first for * compatibility with xattrs from this platform. If that fails, * try again without the namespace prefix for compatibility with * other platforms. */ char *xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); if (error == -ENODATA) error = zpl_xattr_get(ip, name, value, size); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int __zpl_xattr_user_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { (void) user_ns; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (ZFS_XA_NS_PREFIX_FORBIDDEN(name)) return (-EINVAL); if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); /* * Remove alternate compat version of the xattr so we only set the * version specified by the zfs_xattr_compat tunable. * * The following flags must be handled correctly: * * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist */ char *prefixed_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); const char *clear_name, *set_name; if (zfs_xattr_compat) { clear_name = prefixed_name; set_name = name; } else { clear_name = name; set_name = prefixed_name; } /* * Clear the old value with the alternative name format, if it exists. */ error = zpl_xattr_set(ip, clear_name, NULL, 0, flags); /* * XATTR_CREATE was specified and we failed to clear the xattr * because it already exists. Stop here. */ if (error == -EEXIST) goto out; /* * If XATTR_REPLACE was specified and we succeeded to clear * an xattr, we don't need to replace anything when setting * the new value. If we failed with -ENODATA that's fine, * there was nothing to be cleared and we can ignore the error. */ if (error == 0) flags &= ~XATTR_REPLACE; /* * Set the new value with the configured name format. */ error = zpl_xattr_set(ip, set_name, value, size, flags); out: kmem_strfree(prefixed_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); static xattr_handler_t zpl_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = zpl_xattr_user_list, .get = zpl_xattr_user_get, .set = zpl_xattr_user_set, }; /* * Trusted extended attributes * * "Trusted extended attributes are visible and accessible only to * processes that have the CAP_SYS_ADMIN capability. Attributes in this * class are used to implement mechanisms in user space (i.e., outside * the kernel) which keep information in extended attributes to which * ordinary processes should not have access." - xattr(7) */ static int __zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (capable(CAP_SYS_ADMIN)); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); static int __zpl_xattr_trusted_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int __zpl_xattr_trusted_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { (void) user_ns; char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); static xattr_handler_t zpl_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = zpl_xattr_trusted_list, .get = zpl_xattr_trusted_get, .set = zpl_xattr_trusted_set, }; /* * Extended security attributes * * "The security attribute namespace is used by kernel security modules, * such as Security Enhanced Linux, and also to implement file * capabilities (see capabilities(7)). Read and write access * permissions to security attributes depend on the policy implemented * for each security attribute by the security module. When no security * module is loaded, all processes have read access to extended security * attributes, and write access is limited to processes that have the * CAP_SYS_ADMIN capability." - xattr(7) */ static int __zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (1); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); static int __zpl_xattr_security_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int __zpl_xattr_security_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { (void) user_ns; char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); static int zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, void *fs_info) { const struct xattr *xattr; int error = 0; for (xattr = xattrs; xattr->name != NULL; xattr++) { error = __zpl_xattr_security_set(NULL, ip, xattr->name, xattr->value, xattr->value_len, 0); if (error < 0) break; } return (error); } int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr) { return security_inode_init_security(ip, dip, qstr, &zpl_xattr_security_init_impl, NULL); } /* * Security xattr namespace handlers. */ static xattr_handler_t zpl_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = zpl_xattr_security_list, .get = zpl_xattr_security_get, .set = zpl_xattr_security_set, }; /* * Extended system attributes * * "Extended system attributes are used by the kernel to store system * objects such as Access Control Lists. Read and write access permissions * to system attributes depend on the policy implemented for each system * attribute implemented by filesystems in the kernel." - xattr(7) */ #ifdef CONFIG_FS_POSIX_ACL static int zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) { char *name, *value = NULL; int error = 0; size_t size = 0; if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { umode_t mode = ip->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error < 0) { return (error); } else { /* * The mode bits will have been set by * ->zfs_setattr()->zfs_acl_chmod_setattr() * using the ZFS ACL conversion. If they * differ from the Posix ACL conversion dirty * the inode to write the Posix mode bits. */ if (ip->i_mode != mode) { ip->i_mode = ITOZ(ip)->z_mode = mode; - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, + current_time(ip)); zfs_mark_inode_dirty(ip); } if (error == 0) acl = NULL; } } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(ip->i_mode)) return (acl ? -EACCES : 0); break; default: return (-EINVAL); } if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmem_alloc(size, KM_SLEEP); error = zpl_acl_to_xattr(acl, value, size); if (error < 0) { kmem_free(value, size); return (error); } } error = zpl_xattr_set(ip, name, value, size, 0); if (value) kmem_free(value, size); if (!error) { if (acl) zpl_set_cached_acl(ip, type, acl); else zpl_forget_cached_acl(ip, type); } return (error); } #ifdef HAVE_SET_ACL int #ifdef HAVE_SET_ACL_USERNS zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type) #elif defined(HAVE_SET_ACL_IDMAP_DENTRY) zpl_set_acl(struct mnt_idmap *userns, struct dentry *dentry, struct posix_acl *acl, int type) #elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) zpl_set_acl(struct user_namespace *userns, struct dentry *dentry, struct posix_acl *acl, int type) #else zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) #endif /* HAVE_SET_ACL_USERNS */ { #ifdef HAVE_SET_ACL_USERNS_DENTRY_ARG2 return (zpl_set_acl_impl(d_inode(dentry), acl, type)); #elif defined(HAVE_SET_ACL_IDMAP_DENTRY) return (zpl_set_acl_impl(d_inode(dentry), acl, type)); #else return (zpl_set_acl_impl(ip, acl, type)); #endif /* HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ } #endif /* HAVE_SET_ACL */ static struct posix_acl * zpl_get_acl_impl(struct inode *ip, int type) { struct posix_acl *acl; void *value = NULL; char *name; /* * As of Linux 3.14, the kernel get_acl will check this for us. * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong * as the kernel get_acl will set it to temporary sentinel value. */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE acl = get_cached_acl(ip, type); if (acl != ACL_NOT_CACHED) return (acl); #endif switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return (ERR_PTR(-EINVAL)); } int size = zpl_xattr_get(ip, name, NULL, 0); if (size > 0) { value = kmem_alloc(size, KM_SLEEP); size = zpl_xattr_get(ip, name, value, size); } if (size > 0) { acl = zpl_acl_from_xattr(value, size); } else if (size == -ENODATA || size == -ENOSYS) { acl = NULL; } else { acl = ERR_PTR(-EIO); } if (size > 0) kmem_free(value, size); /* As of Linux 4.7, the kernel get_acl will set this for us */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE if (!IS_ERR(acl)) zpl_set_cached_acl(ip, type, acl); #endif return (acl); } #if defined(HAVE_GET_ACL_RCU) || defined(HAVE_GET_INODE_ACL) struct posix_acl * zpl_get_acl(struct inode *ip, int type, bool rcu) { if (rcu) return (ERR_PTR(-ECHILD)); return (zpl_get_acl_impl(ip, type)); } #elif defined(HAVE_GET_ACL) struct posix_acl * zpl_get_acl(struct inode *ip, int type) { return (zpl_get_acl_impl(ip, type)); } #else #error "Unsupported iops->get_acl() implementation" #endif /* HAVE_GET_ACL_RCU */ int zpl_init_acl(struct inode *ip, struct inode *dir) { struct posix_acl *acl = NULL; int error = 0; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (!S_ISLNK(ip->i_mode)) { acl = zpl_get_acl_impl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (!acl) { ITOZ(ip)->z_mode = (ip->i_mode &= ~current_umask()); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); return (0); } } if (acl) { umode_t mode; if (S_ISDIR(ip->i_mode)) { error = zpl_set_acl_impl(ip, acl, ACL_TYPE_DEFAULT); if (error) goto out; } mode = ip->i_mode; error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error >= 0) { ip->i_mode = ITOZ(ip)->z_mode = mode; zfs_mark_inode_dirty(ip); if (error > 0) { error = zpl_set_acl_impl(ip, acl, ACL_TYPE_ACCESS); } } } out: zpl_posix_acl_release(acl); return (error); } int zpl_chmod_acl(struct inode *ip) { struct posix_acl *acl; int error; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return (PTR_ERR(acl)); error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); if (!error) error = zpl_set_acl_impl(ip, acl, ACL_TYPE_ACCESS); zpl_posix_acl_release(acl); return (error); } static int __zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); static int __zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); static int __zpl_xattr_acl_get_access(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); static int __zpl_xattr_acl_get_default(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl_impl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int __zpl_xattr_acl_set_access(zidmap_t *mnt_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); #if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); #else (void) mnt_ns; if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EPERM); #endif if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int __zpl_xattr_acl_set_default(zidmap_t *mnt_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); #if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); #else (void) mnt_ns; if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EPERM); #endif if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl_impl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default); /* * ACL access xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ static xattr_handler_t zpl_xattr_acl_access_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_ACCESS, #else .prefix = XATTR_NAME_POSIX_ACL_ACCESS, #endif .list = zpl_xattr_acl_list_access, .get = zpl_xattr_acl_get_access, .set = zpl_xattr_acl_set_access, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_ACCESS, #endif }; /* * ACL default xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ static xattr_handler_t zpl_xattr_acl_default_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_DEFAULT, #else .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, #endif .list = zpl_xattr_acl_list_default, .get = zpl_xattr_acl_get_default, .set = zpl_xattr_acl_set_default, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_DEFAULT, #endif }; #endif /* CONFIG_FS_POSIX_ACL */ xattr_handler_t *zpl_xattr_handlers[] = { &zpl_xattr_security_handler, &zpl_xattr_trusted_handler, &zpl_xattr_user_handler, #ifdef CONFIG_FS_POSIX_ACL &zpl_xattr_acl_access_handler, &zpl_xattr_acl_default_handler, #endif /* CONFIG_FS_POSIX_ACL */ NULL }; static const struct xattr_handler * zpl_xattr_handler(const char *name) { if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) return (&zpl_xattr_user_handler); if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) return (&zpl_xattr_trusted_handler); if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) return (&zpl_xattr_security_handler); #ifdef CONFIG_FS_POSIX_ACL if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0) return (&zpl_xattr_acl_access_handler); if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) return (&zpl_xattr_acl_default_handler); #endif /* CONFIG_FS_POSIX_ACL */ return (NULL); } static enum xattr_permission zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) { const struct xattr_handler *handler; struct dentry *d __maybe_unused = xf->dentry; enum xattr_permission perm = XAPERM_ALLOW; handler = zpl_xattr_handler(name); if (handler == NULL) { /* Do not expose FreeBSD system namespace xattrs. */ if (ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name)) return (XAPERM_DENY); /* * Anything that doesn't match a known namespace gets put in the * user namespace for compatibility with other platforms. */ perm = XAPERM_COMPAT; handler = &zpl_xattr_user_handler; } if (handler->list) { #if defined(HAVE_XATTR_LIST_SIMPLE) if (!handler->list(d)) return (XAPERM_DENY); #elif defined(HAVE_XATTR_LIST_DENTRY) if (!handler->list(d, NULL, 0, name, name_len, 0)) return (XAPERM_DENY); #elif defined(HAVE_XATTR_LIST_HANDLER) if (!handler->list(handler, d, NULL, 0, name, name_len)) return (XAPERM_DENY); #endif } return (perm); } #if defined(CONFIG_FS_POSIX_ACL) && \ (!defined(HAVE_POSIX_ACL_RELEASE) || \ defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)) struct acl_rel_struct { struct acl_rel_struct *next; struct posix_acl *acl; clock_t time; }; #define ACL_REL_GRACE (60*HZ) #define ACL_REL_WINDOW (1*HZ) #define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) /* * Lockless multi-producer single-consumer fifo list. * Nodes are added to tail and removed from head. Tail pointer is our * synchronization point. It always points to the next pointer of the last * node, or head if list is empty. */ static struct acl_rel_struct *acl_rel_head = NULL; static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; static void zpl_posix_acl_free(void *arg) { struct acl_rel_struct *freelist = NULL; struct acl_rel_struct *a; clock_t new_time; boolean_t refire = B_FALSE; ASSERT3P(acl_rel_head, !=, NULL); while (acl_rel_head) { a = acl_rel_head; if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { /* * If a is the last node we need to reset tail, but we * need to use cmpxchg to make sure it is still the * last node. */ if (acl_rel_tail == &a->next) { acl_rel_head = NULL; if (cmpxchg(&acl_rel_tail, &a->next, &acl_rel_head) == &a->next) { ASSERT3P(a->next, ==, NULL); a->next = freelist; freelist = a; break; } } /* * a is not last node, make sure next pointer is set * by the adder and advance the head. */ while (READ_ONCE(a->next) == NULL) cpu_relax(); acl_rel_head = a->next; a->next = freelist; freelist = a; } else { /* * a is still in grace period. We are responsible to * reschedule the free task, since adder will only do * so if list is empty. */ new_time = a->time + ACL_REL_SCHED; refire = B_TRUE; break; } } if (refire) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, new_time); while (freelist) { a = freelist; freelist = a->next; kfree(a->acl); kmem_free(a, sizeof (struct acl_rel_struct)); } } void zpl_posix_acl_release_impl(struct posix_acl *acl) { struct acl_rel_struct *a, **prev; a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); a->next = NULL; a->acl = acl; a->time = ddi_get_lbolt(); /* atomically points tail to us and get the previous tail */ prev = xchg(&acl_rel_tail, &a->next); ASSERT3P(*prev, ==, NULL); *prev = a; /* if it was empty before, schedule the free task */ if (prev == &acl_rel_head) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); } #endif ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW, "Use legacy ZFS xattr naming for writing new user namespace xattrs"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 76521c95911e..f94ce69fb9e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1,1628 +1,1646 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_BLK_MQ #include #endif static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; /* * The maximum number of volblocksize blocks to process per thread. Typically, * write heavy workloads preform better with higher values here, and read * heavy workloads preform better with lower values, but that's not a hard * and fast rule. It's basically a knob to tune between "less overhead with * less parallelism" and "more overhead, but more parallelism". * * '8' was chosen as a reasonable, balanced, default based off of sequential * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ #endif /* * Finalize our BIO or request. */ #ifdef HAVE_BLK_MQ #define END_IO(zv, bio, rq, error) do { \ if (bio) { \ BIO_END_IO(bio, error); \ } else { \ blk_mq_end_request(rq, errno_to_bi_status(error)); \ } \ } while (0) #else #define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) #endif #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; static unsigned int zvol_actual_blk_mq_queue_depth; #endif struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ #ifdef HAVE_BLK_MQ struct blk_mq_tag_set tag_set; #endif /* Set from the global 'zvol_use_blk_mq' at zvol load */ boolean_t use_blk_mq; }; static taskq_t *zvol_taskq; static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; struct request *rq; } zv_request_t; typedef struct zv_work { struct request *rq; struct work_struct work; } zv_work_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } static void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } #ifdef HAVE_BLK_MQ /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. */ static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; zvol_state_t *zv = rq->q->queuedata; /* Tell the kernel that we are starting to process this request */ blk_mq_start_request(rq); if (blk_rq_is_passthrough(rq)) { /* Skip non filesystem request */ blk_mq_end_request(rq, BLK_STS_IOERR); return (BLK_STS_IOERR); } zvol_request_impl(zv, NULL, rq, 0); /* Acknowledge to the kernel that we got this request */ return (BLK_STS_OK); } static struct blk_mq_ops zvol_blk_mq_queue_ops = { .queue_rq = zvol_mq_queue_rq, }; /* Initialize our blk-mq struct */ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) { struct zvol_state_os *zso = zv->zv_zso; memset(&zso->tag_set, 0, sizeof (zso->tag_set)); /* Initialize tag set. */ zso->tag_set.ops = &zvol_blk_mq_queue_ops; zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; zso->tag_set.numa_node = NUMA_NO_NODE; zso->tag_set.cmd_size = 0; /* * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); } #endif /* HAVE_BLK_MQ */ /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_os_is_zvol(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); END_IO(zv, bio, rq, 0); return; } zfs_uio_bvec_init(&uio, bio, rq); ssize_t start_resid = uio.uio_resid; /* * With use_blk_mq, accounting is done by blk_mq_start_request() * and blk_mq_end_request(), so we can skip it here. */ if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } boolean_t sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; uint64_t start = io_offset(bio, rq); uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); zfs_uio_bvec_init(&uio, bio, rq); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; /* * When blk-mq is being used, accounting is done by * blk_mq_start_request() and blk_mq_end_request(). */ if (bio) { acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } /* * Process a BIO or request * * Either 'bio' or 'rq' should be set depending on if we are processing a * bio or a request (both should not be set). * * force_sync: Set to 0 to defer processing to a background taskq * Set to 1 to process data synchronously */ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync) { fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); if (zvol_request_sync) force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, .rq = rq, }; if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); END_IO(zv, bio, rq, -SET_ERROR(EIO)); goto out; } zv_request_task_t *task; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { END_IO(zv, bio, rq, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_discard_task, task, 0, &task->ent); } } else { if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID static void zvol_submit_bio(struct bio *bio) #else static blk_qc_t zvol_submit_bio(struct bio *bio) #endif #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; zvol_request_impl(zv, bio, NULL, 0); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif } static int #ifdef HAVE_BLK_MODE_T zvol_open(struct gendisk *disk, blk_mode_t flag) #else zvol_open(struct block_device *bdev, fmode_t flag) #endif { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); retry: #endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ #ifdef HAVE_BLK_MODE_T zv = disk->private_data; #else zv = bdev->bd_disk->private_data; #endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); } else { drop_suspend = B_TRUE; } } else { drop_suspend = B_TRUE; } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) * the __blkdev_get() function calls fops->open() with the * bdev->bd_mutex lock held. This can result in a deadlock * when zvols from one pool are used as vdevs in another. * * To prevent a lock inversion deadlock we preemptively * take the spa_namespace_lock. Normally the lock will not * be contended and this is safe because spa_open_common() * handles the case where the caller already holds the * spa_namespace_lock. * * When the lock cannot be aquired after multiple retries * this must be the vdev on zvol deadlock case and we have * no choice but to return an error. For 5.12 and older * kernels returning -ERESTARTSYS will result in the * bdev->bd_mutex being dropped, then reacquired, and * fops->open() being called again. This process can be * repeated safely until both locks are acquired. For 5.13 * and newer the -ERESTARTSYS retry logic was removed from * the kernel so the only option is to return the error for * the caller to handle it. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); return (SET_ERROR(-ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); schedule_timeout(MSEC_TO_TICK(10)); goto retry; #endif } else { drop_namespace = B_TRUE; } } error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { if ((blk_mode_is_open_write(flag)) && (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); error = SET_ERROR(-EROFS); } else { zv->zv_open_count++; } } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == 0) #ifdef HAVE_BLK_MODE_T disk_check_media_change(disk); #else zfs_check_media_change(bdev); #endif return (error); } static void #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG zvol_release(struct gendisk *disk) #else zvol_release(struct gendisk *disk, fmode_t unused) #endif { #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: +#ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); +#elif defined(HAVE_SYNC_BLOCKDEV) + sync_blockdev(bdev); +#else +#error "Neither fsync_bdev() nor sync_blockdev() found" +#endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } void zvol_os_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } /* * Why have two separate block_device_operations structs? * * Normally we'd just have one, and assign 'submit_bio' as needed. However, * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we * can't just change submit_bio dynamically at runtime. So just create two * separate structs to get around this. */ static const struct block_device_operations zvol_ops_blk_mq = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); if (zso->zvo_disk == NULL) return (1); zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ return (0); } static int zvol_alloc_blk_mq(zvol_state_t *zv) { #ifdef HAVE_BLK_MQ struct zvol_state_os *zso = zv->zv_zso; /* Allocate our blk-mq tag_set */ if (zvol_blk_mq_alloc_tag_set(zv) != 0) return (1); #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); if (zso->zvo_disk == NULL) { blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Allocate queue */ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); if (IS_ERR(zso->zvo_queue)) { blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; #endif #endif return (0); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef HAVE_BLK_MQ zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #endif /* * The block layer has 3 interfaces for getting BIOs: * * 1. blk-mq request queues (new) * 2. submit_bio() (oldest) * 3. regular request queues (old). * * Each of those interfaces has two permutations: * * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates * both the disk and its queue (5.14 kernel or newer) * * b) We don't have blk_*alloc_disk(), and have to allocate the * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) goto out_kmem; blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); if (!zv->zv_zso->use_blk_mq) { /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } /* Enable /proc/diskstats */ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; /* * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. * This is accomplished by limiting the number of minors for the * device to one and explicitly disabling partition scanning. */ if (volmode == ZFS_VOLMODE_DEV) { zso->zvo_disk->minors = 1; zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ void zvol_os_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ defined(HAVE_BLK_ALLOC_DISK) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else put_disk(zv->zv_zso->zvo_disk); #endif #else blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); #endif #ifdef HAVE_BLK_MQ if (zv->zv_zso->use_blk_mq) blk_mq_free_tag_set(&zv->zv_zso->tag_set); #endif ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); if (zv->zv_zso->use_blk_mq) { /* * IO requests can be really big (1MB). When an IO request * comes in, it is passed off to zvol_read() or zvol_write() * in a new thread, where it is chunked up into 'volblocksize' * sized pieces and processed. So for example, if the request * is a 1MB write and your volblocksize is 128k, one zvol_write * thread will take that request and sequentially do ten 128k * IOs. This is due to the fact that the thread needs to lock * each volblocksize sized block. So you might be wondering: * "instead of passing the whole 1MB request to one thread, * why not pass ten individual 128k chunks to ten threads and * process the whole write in parallel?" The short answer is * that there's a sweet spot number of chunks that balances * the greater parallelism with the added overhead of more * threads. The sweet spot can be different depending on if you * have a read or write heavy workload. Writes typically want * high chunk counts while reads typically want lower ones. On * a test pool with 6 NVMe drives in a 3x 2-disk mirror * configuration, with volblocksize=8k, the sweet spot for good * sequential reads and writes was at 8 chunks. */ /* * Below we tell the kernel how big we want our requests * to be. You would think that blk_queue_io_opt() would be * used to do this since it is used to "set optimal request * size for the queue", but that doesn't seem to do * anything - the kernel still gives you huge requests * with tons of little PAGE_SIZE segments contained within it. * * Knowing that the kernel will just give you PAGE_SIZE segments * no matter what, you can say "ok, I want PAGE_SIZE byte * segments, and I want 'N' of them per request", where N is * the correct number of segments for the volblocksize and * number of chunks you want. */ #ifdef HAVE_BLK_MQ if (zvol_blk_mq_blocks_per_thread != 0) { unsigned int chunks; chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, PAGE_SIZE); blk_queue_max_segments(zv->zv_zso->zvo_queue, (zv->zv_volblocksize * chunks) / PAGE_SIZE); } else { /* * Special case: zvol_blk_mq_blocks_per_thread = 0 * Max everything out. */ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } #endif } else { blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, zv->zv_volblocksize); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT3P(zv->zv_zilog, ==, NULL); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); #ifdef HAVE_ADD_DISK_RET error = add_disk(zv->zv_zso->zvo_disk); #else add_disk(zv->zv_zso->zvo_disk); #endif } else { ida_simple_remove(&zvol_ida, idx); } return (error); } void zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } int zvol_init(void) { int error; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(num_online_cpus(), 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } #ifdef HAVE_BLK_MQ if (zvol_blk_mq_queue_depth == 0) { zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; } else { zvol_actual_blk_mq_queue_depth = MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); } if (zvol_blk_mq_threads == 0) { zvol_blk_mq_actual_threads = num_online_cpus(); } else { zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1024); } #endif zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } zvol_init_impl(); ida_init(&zvol_ida); return (0); } void zvol_fini(void) { zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); +#ifdef HAVE_BLK_MQ +module_param(zvol_blk_mq_queue_depth, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); + +module_param(zvol_use_blk_mq, uint, 0644); +MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); + +module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); +MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, + "Process volblocksize blocks per thread"); +#endif + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index b5946e7604c0..dfea15b74394 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1,10722 +1,10778 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed. For example, when using the ZPL each dentry * holds a references on a znode. These dentries must be pruned before * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * Balance between metadata and data on ghost hits. Values above 100 * increase metadata caching by proportionally reducing effect of ghost * data hits on target data/metadata rate. */ static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; /* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "meta", KSTAT_DATA_UINT64 }, { "pd", KSTAT_DATA_UINT64 }, { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_data", KSTAT_DATA_UINT64 }, { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_data", KSTAT_DATA_UINT64 }, { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_data", KSTAT_DATA_UINT64 }, { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_data", KSTAT_DATA_UINT64 }, { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, { "uncached_data", KSTAT_DATA_UINT64 }, { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_hdr_destroy(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static void arc_prune_async(uint64_t adjust); + #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } uint64_t he = atomic_inc_64_nv( &arc_stats.arcstat_hash_elements.value.ui64); ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); #endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); #endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; (void) vbuf; arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { #ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } #ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); #endif return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); ASSERT3P(tmpbuf, !=, NULL); ASSERT3U(csize, <=, psize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb, &buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) return (cnt); if (state == arc_anon) { arc_hdr_destroy(hdr); return (0); } if (state == arc_uncached && !HDR_PREFETCH(hdr)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); return (0); } multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_increment(hdr, state); return (0); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = 0; for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); update_old = (hdr->b_l1hdr.b_buf != NULL || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; update_old = B_FALSE; } update_new = update_old; if (GHOST_STATE(old_state)) update_old = B_TRUE; if (GHOST_STATE(new_state)) update_new = B_TRUE; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have no arc * buffer to use for the reference. As a result, we * use the arc header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; /* * If we have no more encrypted buffers and we've already * gotten a copy of the decrypted data we can free b_rabd * to save some space. */ if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_buf_t *b; for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { if (b != buf && ARC_BUF_ENCRYPTED(b)) break; } if (b == NULL) arc_hdr_free_abd(hdr, B_TRUE); } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); arc_buf_destroy_impl(buf); (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); evicted_state = (state == arc_uncached) ? arc_anon : ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed buffer then we * discard it here before we change states. This ensures that the * accounting is updated correctly in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); if (evicted_state == arc_anon) { arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. */ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; } return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) kmem_cache_free(hdr_full_cache, markers[i]); kmem_free(markers, sizeof (*markers) * count); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified. This * function prevents us from trying to evict more from a state's list * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, type, 0, delta)); } return (0); } /* * Adjust specified fraction, taking into account initial ghost state(s) size, * ghost hit bytes towards increasing the fraction, ghost hit bytes towards * decreasing it, plus a balance factor, controlling the decrease rate, used * to balance metadata vs data. */ static uint64_t arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, uint_t balance) { if (total < 8 || up + down == 0) return (frac); /* * We should not have more ghost hits than ghost size, but they * may get close. Restrict maximum adjustment in that case. */ if (up + down >= total / 4) { uint64_t scale = (up + down) / (total / 8); up /= scale; down /= scale; } /* Get maximal dynamic range by choosing optimal shifts. */ int s = highbit64(total); s = MIN(64 - s, 32); uint64_t ofrac = (1ULL << 32) - frac; if (frac >= 4 * ofrac) up /= frac / (2 * ofrac + 1); up = (up << s) / (total >> (32 - s)); if (ofrac >= 4 * frac) down /= ofrac / (2 * frac + 1); down = (down << s) / (total >> (32 - s)); down = down * 100 / balance; return (frac + up - down); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t asize, bytes, total_evicted = 0; int64_t e, mrud, mrum, mfud, mfum, w; static uint64_t ogrd, ogrm, ogfd, ogfm; static uint64_t gsrd, gsrm, gsfd, gsfm; uint64_t ngrd, ngrm, ngfd, ngfm; /* Get current size of ARC states we can evict from. */ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); uint64_t d = mrud + mfud; uint64_t m = mrum + mfum; uint64_t t = d + m; /* Get ARC ghost hits since last eviction. */ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t grd = ngrd - ogrd; ogrd = ngrd; ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t grm = ngrm - ogrm; ogrm = ngrm; ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t gfd = ngfd - ogfd; ogfd = ngfd; ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t gfm = ngfm - ogfm; ogfm = ngfm; /* Adjust ARC states balance based on ghost hits. */ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, grm + gfm, grd + gfd, zfs_arc_meta_balance); arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); asize = aggsum_value(&arc_sums.arcstat_size); int64_t wt = t - (asize - arc_c); /* * Try to reduce pinned dnodes if more than 3/4 of wanted metadata * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); w = wt * (int64_t)(arc_meta >> 16) >> 16; if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } else if (dn > arc_dnode_limit) { prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } if (prune > 0) arc_prune_async(prune); /* Evict MRU metadata. */ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; mrum -= bytes; asize -= bytes; /* Evict MFU metadata. */ w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; mfum -= bytes; asize -= bytes; /* Evict MRU data. */ wt -= m - total_evicted; w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; mrud -= bytes; asize -= bytes; /* Evict MFU data. */ e = asize - arc_c; bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); mfud -= bytes; total_evicted += bytes; /* * Evict ghost lists * * Size of each state's ghost list represents how much that state * may grow by shrinking the other states. Would it need to shrink * other states to zero (that is unlikely), its ghost size would be * equal to sum of other three state sizes. But excessive ghost * size may result in false ghost hits (too far back), that may * never result in real cache hits if several states are competing. * So choose some arbitraty point of 1/2 of other state sizes. */ gsrd = (mrum + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - gsrd; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); gsrm = (mrud + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - gsrm; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); gsfd = (mrud + mrum + mfum) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - gsfd; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); gsfm = (mrud + mrum + mfud) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - gsfm; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == NULL); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t c = arc_c; if (c <= arc_c_min) return; /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t asize = aggsum_value(&arc_sums.arcstat_size); if (asize < c) to_free += c - asize; arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ if (arc_evict_needed) return (B_TRUE); /* * If we have buffers in uncached state, evict them periodically. */ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && ddi_get_lbolt() - arc_last_uncached_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Always try to evict from uncached state. */ arc_last_uncached_flush = ddi_get_lbolt(); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); /* Evict from other states only if told to. */ if (arc_evict_needed) evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t can_free = arc_c - arc_c_min; if (can_free > 0) { int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(uint64_t bytes) { /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_upper_bound(&arc_sums.arcstat_size) + 2 * SPA_MAXBLOCKSIZE >= arc_c) { uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; } } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static arc_ovf_level_t arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; if (!use_reserve) overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (alloc_flags & ARC_HDR_ALLOC_LINEAR) return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); else return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size[type], size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. */ static void arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); /* * Update buffer prefetch status. */ boolean_t was_prefetch = HDR_PREFETCH(hdr); boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; if (was_prefetch != now_prefetch) { if (was_prefetch) { ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, prefetch); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); if (was_prefetch) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } else { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (now_prefetch) { if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_prescient_prefetch); } else { ARCSTAT_BUMP(arcstat_predictive_prefetch); } } if (arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { arc_state_t *new_state; /* * This buffer is not in the cache, and does not appear in * our "ghost" lists. Add it to the MRU or uncached state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = now; if (HDR_UNCACHED(hdr)) { new_state = arc_uncached; DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, hdr); } else { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { /* * This buffer has been accessed once recently and either * its read is still in progress or it is in the cache. */ if (HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_arc_access = now; return; } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); /* * If the previous access was a prefetch, then it already * handled possible promotion, so nothing more to do for now. */ if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* * If more than ARC_MINTIME have passed from the previous * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been accessed once recently, but was * evicted from the cache. Would we have bigger MRU, it * would be an MRU hit, so handle it the same way, except * we don't need to check the previous access time. */ hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and either * still in the cache or being restored from one of ghosts. */ if (!HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); } hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { /* * This buffer has been accessed more than once recently, but * has been evicted from the cache. Would we have bigger MFU * it would stay in cache, so move it back to MFU state. */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { /* * This buffer is uncacheable, but we got a hit. Probably * a demand read after prefetch. Nothing more to do here. */ if (!HDR_IO_IN_PROGRESS(hdr)) ARCSTAT_BUMP(arcstat_uncached_hits); hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC and was not accessed * for a long time, so treat it as new and put into MRU. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { /* We need the last one to call below in original order. */ callback_list = acb; if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, &zio->io_bp->blk_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); if (hash_lock != NULL) mutex_exit(hash_lock); /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_prev; if (acb->acb_wait) { mutex_enter(&acb->acb_wait_lock); acb->acb_wait_error = zio->io_error; acb->acb_wait = B_FALSE; cv_signal(&acb->acb_wait_cv); mutex_exit(&acb->acb_wait_lock); /* acb will be freed by the waiting thread. */ } else { kmem_free(acb, sizeof (arc_callback_t)); } } } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: /* * Verify the block pointer contents are reasonable. This should * always be the case since the blkptr is protected by a checksum. * However, if there is damage it's desirable to detect this early * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto done; } zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the acb_wait_cv whereas nowait * reads register a callback. Both are signalled/called * in arc_read_done. * * Errors of the physical I/O may need to be propagated. * Synchronous read errors are returned here from * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ arc_callback_t *acb = NULL; if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; if (*arc_flags & ARC_FLAG_WAIT) { acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); } acb->acb_zb = *zb; if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_iohits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, iohits); if (*arc_flags & ARC_FLAG_WAIT) { mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } rc = acb->acb_wait_error; mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_TRUE); if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb, &hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { arc_buf_destroy_impl(buf); buf = NULL; (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, hits); *arc_flags |= ARC_FLAG_CACHED; goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ENOENT); goto done; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ arc_callback_t *acb = kmem_zalloc( sizeof (arc_callback_t), KM_SLEEP); acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); acb->acb_zio_head = hdr->b_l1hdr.b_acb->acb_zio_head; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); goto top; } } if (*arc_flags & ARC_FLAG_UNCACHED) { arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); if (!encrypted_read) alloc_flags |= ARC_HDR_ALLOC_LINEAR; } /* * Take additional reference for IO_IN_PROGRESS. It stops * arc_access() from putting this header without any buffers * and so other references but obviously nonevictable onto * the evictable list of MRU or MFU state. */ add_reference(hdr, hdr); if (!embedded_bp) arc_access(hdr, *arc_flags, B_FALSE); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && - (*arc_flags & ARC_FLAG_PREFETCH))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); done: if (done) done(NULL, zb, bp, buf, private); if (pio && rc != 0) { zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); zio->io_error = rc; zio_nowait(zio); } goto out; } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. This provides a mechanism to ensure the ARC can + * honor the metadata limit and reclaim otherwise pinned ARC buffers. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +static void +arc_prune_async(uint64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); } callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t) (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { data->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); metadata->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_uncached_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_data, &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_data, &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_data, &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_data, &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_data, &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, &as->arcstat_uncached_data, &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); as->arcstat_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); if (arc_dnode_limit > arc_c_max) arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 0 - */ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; /* * 32-bit fixed point fractions of metadata from total ARC size, * MRU data from all data and MRU metadata from all metadata. */ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (l2arc_trim_ahead > 1) { cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); l2arc_trim_ahead = 1; } if (arc_warm == B_FALSE) size += l2arc_write_boost; size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; top: rerun = B_FALSE; if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * In some cases, we can wind up with size > asize, so * we need to opt for the larger allocation option here. * * (We also need abd_return_buf_copy in all cases because * it's an ASSERT() to modify the buffer before returning it * with arc_return_buf(), and all the compressors * write things before deciding to fail compression in nearly * every case.) */ uint64_t bufsize = MAX(size, asize); cabd = abd_alloc_for_io(bufsize, ismd); tmp = abd_borrow_buf(cabd, bufsize); psize = zio_compress_data(compress, to_write, &tmp, size, hdr->b_complevel); if (psize >= asize) { psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, bufsize); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, bufsize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, bufsize); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { if (pass == 1 || pass == 3) continue; } multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); /* * If the allocated size of this buffer plus the max * size for the pending log block exceeds the evicted * target size, terminate writing buffers for this run. */ if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) { /* * l2ad_hand will be adjusted in * l2arc_log_blk_commit(). */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; return (asize); } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 0eb8c17e331a..8451b5082e86 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -1,1578 +1,1580 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, { "dmu_tx_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_error", KSTAT_DATA_UINT64 }, { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, { "dmu_tx_group", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); TXG_VERIFY(dp->dp_spa, txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; if (dn != NULL) { (void) zfs_refcount_add(&dn->dn_holds, tx); if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). */ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) zfs_refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; zfs_refcount_create(&txh->txh_space_towrite); zfs_refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dnode_t *dn = NULL; dmu_tx_hold_t *txh; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { tx->tx_err = err; return (NULL); } } txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); if (dn != NULL) dnode_rele(dn, FTAG); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); + err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); /* * PARTIAL_FIRST allows caching for uncacheable blocks. It will * be cleared after dmu_buf_will_dirty() call dbuf_read() again. */ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | (level == 0 ? DB_RF_PARTIAL_FIRST : 0)); dbuf_rele(db, FTAG); return (err); } static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the append; first level-0 block (if not aligned, i.e. * if they are partial-block writes), no additional blocks are read. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } /* * Should be used when appending to an object and the exact offset is unknown. * The write must occur at or beyond the specified offset. Only the L0 block * at provided offset will be prefetched. */ void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_APPEND, off, DMU_OBJECT_END); if (txh != NULL) { dmu_tx_count_append(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); if (txh != NULL) { dmu_tx_count_append(txh, off, len); dmu_tx_count_dnode(txh); } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; ASSERT(tx->tx_txg == 0); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. */ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) zfs_refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } } void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } } static void dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { /* * Reuse dmu_tx_count_free(), it does exactly what we need for clone. */ dmu_tx_count_free(txh, off, len); } void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_clone(txh, off, len); } } static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, zap_micro_max_size, FTAG); if (dn == NULL) return; ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. */ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(dn != NULL); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); if (txh) { (void) zfs_refcount_add_many( &txh->txh_space_towrite, space, FTAG); } } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, (u_longlong_t)beginblk, (u_longlong_t)endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_APPEND: if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; /* * THT_WRITE used for bonus and spill blocks. */ ASSERT(blkid != DMU_BONUS_BLKID && blkid != DMU_SPILL_BLKID); /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. */ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; case THT_CLONE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; break; default: cmn_err(CE_PANIC, "bad txh_type %d", txh->txh_type); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes, wrlog; hrtime_t wakeup, tx_time = 0, now; /* Calculate minimum transaction time for the dirty data amount. */ delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; if (dirty > delay_min_bytes) { /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which * could cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); } /* Calculate minimum transaction time for the TX_WRITE log size. */ wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; if (wrlog >= zfs_wrlog_data_max) { tx_time = zfs_delay_max_ns; } else if (wrlog > delay_min_bytes) { tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / (zfs_wrlog_data_max - wrlog), tx_time); } if (tx_time == 0) return; tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); zfs_sleep_until(wakeup); } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. * * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). * * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. */ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { spa_t *spa = tx->tx_pool->dp_spa; ASSERT0(tx->tx_txg); if (tx->tx_err) { DMU_TX_STAT_BUMP(dmu_tx_error); return (tx->tx_err); } if (spa_suspended(spa)) { DMU_TX_STAT_BUMP(dmu_tx_suspended); /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && !(txg_how & TXG_WAIT)) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_wrlog_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ uint64_t towrite = 0; uint64_t tohold = 0; for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { /* * This thread can't hold the dn_struct_rwlock * while assigning the tx, because this can lead to * deadlock. Specifically, if this dnode is already * assigned to an earlier txg, this thread may need * to wait for that txg to sync (the ERESTART case * below). The other thread that has assigned this * dnode to an earlier txg prevents this txg from * syncing until its tx can complete (calling * dmu_tx_commit()), but it may need to acquire the * dn_struct_rwlock to do so (e.g. via * dmu_buf_hold*()). * * Note that this thread can't hold the lock for * read either, but the rwlock doesn't record * enough information to make that assertion. */ ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; DMU_TX_STAT_BUMP(dmu_tx_group); return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) zfs_refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += zfs_refcount_count(&txh->txh_space_towrite); tohold += zfs_refcount_count(&txh->txh_memory_tohold); } /* needed allocation: worst-case estimate of write space */ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ uint64_t memory = towrite + tohold; if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); if (err != 0) return (err); } DMU_TX_STAT_BUMP(dmu_tx_assigned); return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh && txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group; txg_how is a bitmask: * * If TXG_WAIT is set and the currently open txg is full, this function * will wait until there's a new txg. This should be used when no locks * are being held. With this bit set, this function will only fail if * we're truly out of space (or over quota). * * If TXG_WAIT is *not* set and we can't assign into the currently open * txg without blocking, this function will return immediately with * ERESTART. This should be used whenever locks are being held. On an * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), * and try again. * * If TXG_NOTHROTTLE is set, this indicates that this tx should not be * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. Of course this is * not strong monotonicity, because the same txg can be returned multiple * times in a row. This guarantee holds both for subsequent calls from * one thread and for multiple threads. For example, it is impossible to * observe the following sequence of events: * * Thread 1 Thread 2 * * dmu_tx_assign(T1, ...) * 1 <- dmu_tx_get_txg(T1) * dmu_tx_assign(T2, ...) * 2 <- dmu_tx_get_txg(T2) * dmu_tx_assign(T3, ...) * 1 <- dmu_tx_get_txg(T3) */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; ASSERT(tx->tx_txg == 0); ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); if ((txg_how & TXG_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; hrtime_t before; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); before = gethrtime(); if (tx->tx_wait_dirty) { uint64_t dirty; /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); if (dp->dp_dirty_total >= zfs_dirty_data_max) DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_dirty_delayed only has effect if the * caller used TX_WAIT. Otherwise they are going to * destroy this tx and try again. The common case, * zfs_write(), uses TX_WAIT. */ tx->tx_dirty_delayed = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { /* * If we have a lot of dirty data just wait until we sync * out a TXG at which point we'll hopefully have synced * a portion of the changes. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); } static void dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); zfs_refcount_destroy_many(&txh->txh_space_towrite, zfs_refcount_count(&txh->txh_space_towrite)); zfs_refcount_destroy_many(&txh->txh_memory_tohold, zfs_refcount_count(&txh->txh_memory_tohold)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_commit(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { ASSERT(tx->tx_txg == 0); /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); dmu_tx_destroy(tx); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. */ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { if (!sa->sa_need_attr_registration) return; for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); if (txh != NULL) (void) zfs_refcount_add_many(&txh->txh_space_towrite, SPA_OLD_MAXBLOCKSIZE, FTAG); } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; DB_DNODE_ENTER(db); dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); DB_DNODE_EXIT(db); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } void dmu_tx_init(void) { dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dmu_tx_ksp != NULL) { dmu_tx_ksp->ks_data = &dmu_tx_stats; kstat_install(dmu_tx_ksp); } } void dmu_tx_fini(void) { if (dmu_tx_ksp != NULL) { kstat_delete(dmu_tx_ksp); dmu_tx_ksp = NULL; } } #if defined(_KERNEL) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_append); EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_bonus); EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode); EXPORT_SYMBOL(dmu_tx_abort); EXPORT_SYMBOL(dmu_tx_assign); EXPORT_SYMBOL(dmu_tx_wait); EXPORT_SYMBOL(dmu_tx_commit); EXPORT_SYMBOL(dmu_tx_mark_netfree); EXPORT_SYMBOL(dmu_tx_get_txg); EXPORT_SYMBOL(dmu_tx_callback_register); EXPORT_SYMBOL(dmu_tx_do_callbacks); EXPORT_SYMBOL(dmu_tx_hold_spill); EXPORT_SYMBOL(dmu_tx_hold_sa_create); EXPORT_SYMBOL(dmu_tx_hold_sa); #endif diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 9120fef93c74..17b971248283 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -1,1494 +1,1494 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS Write Throttle * ------------------ * * ZFS must limit the rate of incoming writes to the rate at which it is able * to sync data modifications to the backend storage. Throttling by too much * creates an artificial limit; throttling by too little can only be sustained * for short periods and would lead to highly lumpy performance. On a per-pool * basis, ZFS tracks the amount of modified (dirty) data. As operations change * data, the amount of dirty data increases; as ZFS syncs out data, the amount * of dirty data decreases. When the amount of dirty data exceeds a * predetermined threshold further modifications are blocked until the amount * of dirty data decreases (as data is synced out). * * The limit on dirty data is tunable, and should be adjusted according to * both the IO capacity and available memory of the system. The larger the * window, the more ZFS is able to aggregate and amortize metadata (and data) * changes. However, memory is a limited resource, and allowing for more dirty * data comes at the cost of keeping other useful data in memory (for example * ZFS data cached by the ARC). * * Implementation * * As buffers are modified dsl_pool_willuse_space() increments both the per- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of * dirty space used; dsl_pool_dirty_space() decrements those values as data * is synced out from dsl_pool_sync(). While only the poolwide value is * relevant, the per-txg value is useful for debugging. The tunable * zfs_dirty_data_max determines the dirty space limit. Once that value is * exceeded, new writes are halted until space frees up. * * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we * ensure that there is a txg syncing (see the comment in txg.c for a full * description of transaction group stages). * * The IO scheduler uses both the dirty space limit and current amount of * dirty data as inputs. Those values affect the number of concurrent IOs ZFS * issues. See the comment in vdev_queue.c for details of the IO scheduler. * * The delay is also calculated based on the amount of dirty data. See the * comment above dmu_tx_delay() for details. */ /* * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ uint64_t zfs_dirty_data_max = 0; uint64_t zfs_dirty_data_max_max = 0; uint_t zfs_dirty_data_max_percent = 10; uint_t zfs_dirty_data_max_max_percent = 25; /* * The upper limit of TX_WRITE log data. Write operations are throttled * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ static uint_t zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ uint_t zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. * Larger values cause it to delay more for a given amount of dirty data. * Therefore larger values will cause there to be less dirty data for a * given throughput. * * For the smoothest delay, this value should be about 1 billion divided * by the maximum number of operations per second. This will smoothly * handle between 10x and 1/10th this number. * * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * This determines the number of threads used by the dp_sync_taskq. */ static int zfs_sync_taskq_batch_pct = 75; /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. * If the dispatch fails, the call to zil_itxg_clean() will occur * synchronously in the context of spa_sync(), which can negatively * impact the performance of spa_sync() (e.g. in the case of the itxg * list having a large number of itxs that needs to be cleaned). * * Thus, these tunables can be used to manipulate the behavior of the * taskq used by zil_clean(); they determine the number of taskq entries * that are pre-populated when the taskq is first created (via the * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of * taskq entries that are cached after an on-demand allocation (via the * "zfs_zil_clean_taskq_maxalloc"). * * The idea being, we want to try reasonably hard to ensure there will * already be a taskq entry pre-allocated by the time that it is needed * by zil_clean(). This way, we can avoid the possibility of an * on-demand allocation of a new taskq entry from failing, which would * result in zil_itxg_clean() being called synchronously from zil_clean() * (which can adversely affect performance of spa_sync()). * * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. */ static int zfs_zil_clean_taskq_nthr_pct = 100; static int zfs_zil_clean_taskq_minalloc = 1024; static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * dsl_pool_open_impl(spa_t *spa, uint64_t txg) { dsl_pool_t *dp; blkptr_t *bp = spa_get_rootblkptr(spa); dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rrw_init(&dp->dp_config_rwlock, B_TRUE); txg_init(dp, txg); mmp_init(spa); txg_list_create(&dp->dp_dirty_datasets, spa, offsetof(dsl_dataset_t, ds_dirty_link)); txg_list_create(&dp->dp_dirty_zilogs, spa, offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, spa, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = taskq_create("dp_sync_taskq", zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, TASKQ_THREADS_CPU_PCT); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, zfs_zil_clean_taskq_minalloc, zfs_zil_clean_taskq_maxalloc, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); aggsum_init(&dp->dp_wrlog_total, 0); for (int i = 0; i < TXG_SIZE; i++) { aggsum_init(&dp->dp_wrlog_pertxg[i], 0); } dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", 100, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); /* * Initialize the caller's dsl_pool_t structure before we actually open * the meta objset. This is done because a self-healing write zio may * be issued as part of dmu_objset_open_impl() and the spa needs its * dsl_pool_t initialized in order to handle the write. */ *dpp = dp; err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &dp->dp_meta_objset); if (err != 0) { dsl_pool_close(dp); *dpp = NULL; } return (err); } int dsl_pool_open(dsl_pool_t *dp) { int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; err = dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } dsl_dir_rele(dd, dp); if (err) goto out; } if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err == 0) { VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); } else if (err == ENOENT) { /* * We might not have created the remap bpobj yet. */ } else { goto out; } } /* * Note: errors ignored, because the these special dirs, used for * space accounting, are only created on demand. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj); if (err != 0) goto out; } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, &dp->dp_empty_bpobj); if (err != 0) goto out; } err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); if (err == ENOENT) err = 0; if (err) goto out; err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* * Drop our references from dsl_pool_open(). * * Since we held the origin_snap from "syncing" context (which * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. */ if (dp->dp_origin_snap != NULL) dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir != NULL) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir != NULL) dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_leak_dir != NULL) dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir != NULL) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); bpobj_close(&dp->dp_obsolete_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset != NULL) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_early_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); taskq_destroy(dp->dp_sync_taskq); /* * We can't set retry to TRUE since we're explicitly specifying * a spa to flush. This is good enough; any missed buffers for * this spa won't cause trouble, and they'll eventually fall * out of the ARC just like any other unused buffer. */ arc_flush(dp->dp_spa, FALSE); mmp_fini(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); dmu_buf_user_evict_wait(); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); ASSERT0(aggsum_value(&dp->dp_wrlog_total)); aggsum_fini(&dp->dp_wrlog_total); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); aggsum_fini(&dp->dp_wrlog_pertxg[i]); } taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; /* * Currently, we only create the obsolete_bpobj where there are * indirect vdevs with referenced mappings. */ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); /* create and open the obsolete_bpobj */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, tx)); bpobj_free(dp->dp_meta_objset, dp->dp_obsolete_bpobj.bpo_object, tx); bpobj_close(&dp->dp_obsolete_bpobj); } dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), dsl_crypto_params_t *dcp, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); #ifdef _KERNEL objset_t *os; #else objset_t *os __attribute__((unused)); #endif dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); spa->spa_meta_objset = dp->dp_meta_objset; /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT0(err); /* Initialize scan structures */ VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* * Some features may be needed when creating the root dataset, so we * create the feature objects here. */ if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT) spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx); /* create the root dataset */ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx); /* create the root objset */ VERIFY0(dsl_dataset_hold_obj_flags(dp, obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); #ifdef _KERNEL zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dmu_tx_commit(tx); rrw_exit(&dp->dp_config_rwlock, FTAG); return (dp); } /* * Account for the meta-objset space in its placeholder dsl_dir. */ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp) { ASSERT3U(comp, ==, uncomp); /* it's all metadata */ mutex_enter(&dp->dp_lock); dp->dp_mos_used_delta += used; dp->dp_mos_compressed_delta += comp; dp->dp_mos_uncompressed_delta += uncomp; mutex_exit(&dp->dp_lock); } static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); dmu_objset_sync_done(dp->dp_meta_objset, tx); taskq_wait(dp->dp_sync_taskq); multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } static void dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) { ASSERT(MUTEX_HELD(&dp->dp_lock)); if (delta < 0) ASSERT3U(-delta, <=, dp->dp_dirty_total); dp->dp_dirty_total += delta; /* * Note: we signal even when increasing dp_dirty_total. * This ensures forward progress -- each thread wakes the next waiter. */ if (dp->dp_dirty_total < zfs_dirty_data_max) cv_signal(&dp->dp_spaceavail_cv); } void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) { ASSERT3S(size, >=, 0); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); aggsum_add(&dp->dp_wrlog_total, size); /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) { int64_t delta; delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); /* Compact per-CPU sums after the big change. */ (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) { spa_t *spa = dp->dp_spa; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; txg_list_t *tl = &vd->vdev_ms_list; metaslab_t *ms; for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { VERIFY(range_tree_is_empty(ms->ms_freeing)); VERIFY(range_tree_is_empty(ms->ms_checkpointing)); } } return (B_TRUE); } #else #define dsl_early_sync_task_verify(dp, txg) \ ((void) sizeof (dp), (void) sizeof (txg), B_TRUE) #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { zio_t *zio; dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); tx = dmu_tx_create_assigned(dp, txg); /* * Run all early sync tasks before writing out any dirty blocks. * For more info on early sync tasks see block comment in * dsl_early_sync_task(). */ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { dsl_sync_task_t *dst; ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { ASSERT(dsl_early_sync_task_verify(dp, txg)); dsl_sync_task_sync(dst, tx); } ASSERT(dsl_early_sync_task_verify(dp, txg)); } /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } VERIFY0(zio_wait(zio)); /* * Update the long range free counter after * we're done syncing user data */ mutex_enter(&dp->dp_lock); ASSERT(spa_sync_pass(dp->dp_spa) == 1 || dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; mutex_exit(&dp->dp_lock); /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group/project space accounting. This happens * in tasks dispatched to dp_sync_taskq, so wait for them before * continuing. */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the * sync tasks, so that any snapshots will have the correct * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); /* * Release any key mappings created by calls to * dsl_dataset_dirty() from the userquota accounting * code paths. */ if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } VERIFY0(zio_wait(zio)); /* * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist and livelists * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { objset_t *os = ds->ds_objset; if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } dsl_dataset_sync_done(ds, tx); dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); } /* * The MOS's space is accounted for in the pool/$MOS * (dp_mos_dir). We can't modify the mos while we're syncing * it, so we remember the deltas and apply them here. */ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || dp->dp_mos_uncompressed_delta != 0) { dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, dp->dp_mos_used_delta, dp->dp_mos_compressed_delta, dp->dp_mos_uncompressed_delta, tx); dp->dp_mos_used_delta = 0; dp->dp_mos_compressed_delta = 0; dp->dp_mos_uncompressed_delta = 0; } if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } /* * We have written all of the accounted dirty data, so our * dp_space_towrite should now be zero. However, some seldom-used * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up * the accounting of any dirtied space now. * * Note that, besides any dirty data from datasets, the amount of * dirty data in the MOS is also accounted by the pool. Therefore, * we want to do this cleanup after dsl_pool_sync_mos() so we don't * attempt to update the accounting for the same dirty data twice. * (i.e. at this point we only update the accounting for the space * that we know that we "leaked"). */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. * dsl_dir_destroy_check() will fail if there are unexpected holds. * Therefore, we want to sync the MOS (thus syncing the dd_dbuf * and clearing the hold on it) before we process the sync_tasks. * The MOS data dirtied by the sync_tasks will be synced on the next * pass. */ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. */ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { zilog_t *zilog; while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * We don't remove the zilog from the dp_dirty_zilogs * list until after we've cleaned it. This ensures that * callers of zilog_is_dirty() receive an accurate * answer when they are racing with the spa sync thread. */ zil_clean(zilog, txg); (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } dsl_pool_wrlog_clear(dp, txg); ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* * TRUE if the current thread is the tx_sync_thread or if we * are being called from SPA context during pool initialization. */ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || spa_is_initializing(dp->dp_spa) || taskq_member(dp->dp_sync_taskq, curthread)); } /* * This function returns the amount of allocatable space in the pool * minus whatever space is currently reserved by ZFS for specific * purposes. Specifically: * * 1] Any reserved SLOP space * 2] Any space used by the checkpoint * 3] Any space used for deferred frees * * The latter 2 are especially important because they are needed to * rectify the SPA's and DMU's different understanding of how much space * is used. Now the DMU is aware of that extra space tracked by the SPA * without having to maintain a separate special dir (e.g similar to * $MOS, $FREEING, and $LEAKED). * * Note: By deferred frees here, we mean the frees that were deferred * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the * segments placed in ms_defer trees during metaslab_sync_done(). */ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) { spa_t *spa = dp->dp_spa; uint64_t space, resv, adjustedsize; uint64_t spa_deferred_frees = spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; space = spa_get_dspace(spa) - spa_get_checkpoint_space(spa) - spa_deferred_frees; resv = spa_get_slop_space(spa); switch (slop_policy) { case ZFS_SPACE_CHECK_NORMAL: break; case ZFS_SPACE_CHECK_RESERVED: resv >>= 1; break; case ZFS_SPACE_CHECK_EXTRA_RESERVED: resv >>= 2; break; case ZFS_SPACE_CHECK_NONE: resv = 0; break; default: panic("invalid slop policy value: %d", slop_policy); break; } adjustedsize = (space >= resv) ? (space - resv) : 0; return (adjustedsize); } uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) { uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); uint64_t deferred = metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; return (quota); } uint64_t dsl_pool_deferred_space(dsl_pool_t *dp) { return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa))); } boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. + */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; return (dirty > dirty_min_bytes); } void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { if (space > 0) { mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); boolean_t needsync = !dmu_tx_is_syncing(tx) && dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); if (needsync) txg_kick(dp, tx->tx_txg); } } void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) { ASSERT3S(space, >=, 0); if (space == 0) return; mutex_enter(&dp->dp_lock); if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { /* XXX writing something we didn't dirty? */ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; prev = NULL; } if (prev == NULL) { prev = dp->dp_origin_snap; /* * The $ORIGIN can't have any data, or the accounting * will be wrong. */ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { dsl_dataset_rele(ds, FTAG); return (0); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; dsl_dataset_phys(ds)->ds_prev_snap_txg = dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_num_children++; if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); } } ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) dsl_dataset_rele(prev, FTAG); return (0); } void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; objset_t *mos = dp->dp_meta_objset; if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { dsl_dataset_t *origin; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(origin->ds_dir)->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } return (0); } void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* * We can't use bpobj_alloc(), because spa_version() still * returns the old version, and we need a new-version bpobj with * subobj support. So call dmu_object_alloc() directly. */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t dsobj; dsl_dataset_t *ds; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); } taskq_t * dsl_pool_zrele_taskq(dsl_pool_t *dp) { return (dp->dp_zrele_taskq); } taskq_t * dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp) { return (dp->dp_unlinked_drain_taskq); } /* * Walk through the pool-wide zap object of temporary snapshot user holds * and release them. */ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); holds = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(holds, za.za_name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } dsl_dataset_user_release_tmp(dp, holds); fnvlist_free(holds); zap_cursor_fini(&zc); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ static void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. */ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); kmem_strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } /* * Release a temporary hold for the given dataset object and tag. */ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } /* * DSL Pool Configuration Lock * * The dp_config_rwlock protects against changes to DSL state (e.g. dataset * creation / destruction / rename / property setting). It must be held for * read to hold a dataset or dsl_dir. I.e. you must call * dsl_pool_config_enter() or dsl_pool_hold() before calling * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock * must be held continuously until all datasets and dsl_dirs are released. * * The only exception to this rule is that if a "long hold" is placed on * a dataset, then the dp_config_rwlock may be dropped while the dataset * is still held. The long hold will prevent the dataset from being * destroyed -- the destroy will fail with EBUSY. A long hold can be * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset * (by calling dsl_{dataset,objset}_{try}own{_obj}). * * Legitimate long-holders (including owners) should be long-running, cancelable * tasks that should cause "zfs destroy" to fail. This includes DMU * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), * "zfs send", and "zfs diff". There are several other long-holders whose * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). * * The usual formula for long-holding would be: * dsl_pool_hold() * dsl_dataset_hold() * ... perform checks ... * dsl_dataset_long_hold() * dsl_pool_rele() * ... perform long-running task ... * dsl_dataset_long_rele() * dsl_dataset_rele() * * Note that when the long hold is released, the dataset is still held but * the pool is not held. The dataset may change arbitrarily during this time * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * * Operations generally fall somewhere into the following taxonomy: * * Read-Only Modifying * * Dataset Layer / MOS zfs get zfs destroy * * Individual Dataset read() write() * * * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the * dp_config_rwlock. See the comment above dsl_sync_task() for details. * * Read-only operations will manually hold the pool, then the dataset, obtain * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. * * * Operations On Individual Datasets * * Objects _within_ an objset should only be modified by the current 'owner' * of the objset to prevent incorrect concurrent modification. Thus, use * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, * and fail with EBUSY if there is already an owner. The owner can then * implement its own locking strategy, independent of the dataset layer's * locking infrastructure. * (E.g., the ZPL has its own set of locks to control concurrency. A regular * vnop will not reach into the dataset layer). * * Ideally, objects would also only be read by the objset’s owner, so that we * don’t observe state mid-modification. * (E.g. the ZPL is creating a new object and linking it into a directory; if * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an * intermediate state. The ioctl level violates this but in pretty benign * ways, e.g. reading the zpl props object.) */ int dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp) { spa_t *spa; int error; error = spa_open(name, &spa, tag); if (error == 0) { *dp = spa_get_dsl(spa); dsl_pool_config_enter(*dp, tag); } return (error); } void dsl_pool_rele(dsl_pool_t *dp, const void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void dsl_pool_config_enter(dsl_pool_t *dp, const void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. * * The rrwlock can (with the track_all flag) track all reading threads, * which is very useful for debugging which code path failed to release * the lock, and for verifying that the *current* thread does hold * the lock. * * (Unlike a rwlock, which knows that N threads hold it for * read, but not *which* threads, so rw_held(RW_READER) returns TRUE * if any thread holds it for read, even if this thread doesn't). */ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); } void dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void dsl_pool_config_exit(dsl_pool_t *dp, const void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } boolean_t dsl_pool_config_held(dsl_pool_t *dp) { return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp) { return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); } EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD, "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, "Max percent of CPUs that are used to sync dirty data"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, "Number of taskq entries that are pre-populated"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, "Max number of taskq entries that are cached"); diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index 636c04d9f785..a77874ea0dd3 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -1,637 +1,638 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2021, Colm Buckley */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location until * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; #ifdef _KERNEL static int zfs_autoimport_disable = B_TRUE; #endif /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; zfs_file_t *fp; zfs_file_attr_t zfa; uint64_t fsize; int err; #ifdef _KERNEL if (zfs_autoimport_disable) return; #endif /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); err = zfs_file_open(pathname, O_RDONLY, 0, &fp); #ifdef __FreeBSD__ if (err) err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); #endif kmem_free(pathname, MAXPATHLEN); if (err) return; if (zfs_file_getattr(fp, &zfa)) goto out; fsize = zfa.zfa_size; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ if (zfs_file_read(fp, buf, fsize, NULL) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); zfs_file_close(fp); } static int spa_config_remove(spa_config_dirent_t *dp) { int error = 0; /* * Remove the cache file. If zfs_file_unlink() in not supported by the * platform fallback to truncating the file which is functionally * equivalent. */ error = zfs_file_unlink(dp->scd_path); if (error == EOPNOTSUPP) { int flags = O_RDWR | O_TRUNC; zfs_file_t *fp; error = zfs_file_open(dp->scd_path, flags, 0644, &fp); if (error == 0) { (void) zfs_file_fsync(fp, O_SYNC); (void) zfs_file_close(fp); } } return (error); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; char *temp; int err; zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = spa_config_remove(dp); if (err == ENOENT) err = 0; return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) err = zfs_file_fsync(fp, O_SYNC); zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, boolean_t postblkidevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; const char *pool_name; boolean_t ccw_failure; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) pool_name = fnvlist_lookup_string( spa->spa_config, ZPOOL_CONFIG_POOL_NAME); else pool_name = spa_name(spa); fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); /* * Post udev event to sync blkid information if the pool is created * or a new vdev is added to the pool. */ if ((target->spa_root_vdev) && postblkidevent) { vdev_post_kobj_evt(target->spa_root_vdev); for (int i = 0; i < target->spa_l2cache.sav_count; i++) vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < target->spa_spares.sav_count; i++) vdev_post_kobj_evt(target->spa_spares.sav_vdevs[i]); } } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; const char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. */ if (txg == -1ULL) txg = spa->spa_config_txg; /* * Originally, users had to handle spa namespace collisions by either * exporting the already imported pool or by specifying a new name for * the pool with a conflicting name. In the case of root pools from * virtual guests, neither approach to collision resolution is * reasonable. This is addressed by extending the new name syntax with * an option to specify that the new name is temporary. When specified, * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us * to use the previous name, which we do below. */ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); if (spa->spa_compatibility != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, spa->spa_compatibility); hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); if (vd->vdev_islog) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index 08d918467d03..092b3f375be0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -1,1162 +1,1165 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into five I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, and scrub/resilver. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum. If the * sum of the per-queue maximums exceeds the aggregate maximum, then the * number of active i/os may reach zfs_vdev_max_active, in which case no * further i/os will be issued regardless of whether all per-queue * minimums have been met. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an i/o is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write i/os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. * * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes * into play. We will send min_active from each queue round-robin, and then * send from queues in the order defined by zio_priority_t up to max_active. * Some queues have additional mechanisms to limit number of active I/Os in * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. * * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. */ static uint_t zfs_vdev_sync_read_min_active = 10; static uint_t zfs_vdev_sync_read_max_active = 10; static uint_t zfs_vdev_sync_write_min_active = 10; static uint_t zfs_vdev_sync_write_max_active = 10; static uint_t zfs_vdev_async_read_min_active = 1; /* */ uint_t zfs_vdev_async_read_max_active = 3; static uint_t zfs_vdev_async_write_min_active = 2; /* */ uint_t zfs_vdev_async_write_max_active = 10; static uint_t zfs_vdev_scrub_min_active = 1; static uint_t zfs_vdev_scrub_max_active = 3; static uint_t zfs_vdev_removal_min_active = 1; static uint_t zfs_vdev_removal_max_active = 2; static uint_t zfs_vdev_initializing_min_active = 1; static uint_t zfs_vdev_initializing_max_active = 1; static uint_t zfs_vdev_trim_min_active = 1; static uint_t zfs_vdev_trim_max_active = 2; static uint_t zfs_vdev_rebuild_min_active = 1; static uint_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ uint_t zfs_vdev_async_write_active_min_dirty_percent = 30; uint_t zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), * the number of concurrently-active I/O's is limited to *_min_active, unless * the vdev is "idle". When there are no interactive I/Os active (sync or * async), and zfs_vdev_nia_delay I/Os have completed since the last * interactive I/O, then the vdev is considered to be "idle", and the number * of concurrently-active non-interactive I/O's is increased to *_max_active. */ static uint_t zfs_vdev_nia_delay = 5; /* * Some HDDs tend to prioritize sequential I/O so high that concurrent * random I/O latency reaches several seconds. On some HDDs it happens * even if sequential I/Os are submitted one at a time, and so setting * *_max_active to 1 does not help. To prevent non-interactive I/Os, like * scrub, from monopolizing the device no more than zfs_vdev_nia_credit * I/Os can be sent while there are outstanding incomplete interactive * I/Os. This enforced wait ensures the HDD services the interactive I/O * within a reasonable amount of time. */ static uint_t zfs_vdev_nia_credit = 5; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ static uint_t zfs_vdev_aggregation_limit = 1 << 20; static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; static uint_t zfs_vdev_read_gap_limit = 32 << 10; static uint_t zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level. This percentage is * used in conjunction with zfs_vdev_async_max_active to determine how many * allocations a specific top-level vdev should handle. Once the queue depth * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 * then allocator will stop allocating blocks on that top-level device. * The default kernel setting is 1000% which will yield 100 allocations per * device. For userland testing, the default setting is 300% which equates * to 30 allocations per device. */ #ifdef _KERNEL uint_t zfs_vdev_queue_depth_pct = 1000; #else uint_t zfs_vdev_queue_depth_pct = 300; #endif /* * When performing allocations for a given metaslab, we want to make sure that * there are enough IOs to aggregate together to improve throughput. We want to * ensure that there are at least 128k worth of IOs that can be aggregated, and * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ uint_t zfs_vdev_def_queue_depth = 32; static int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); return (TREE_PCMP(z1, z2)); } #define VDQ_T_SHIFT 29 static int vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, z2->io_timestamp >> VDQ_T_SHIFT); int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); int cmp = tcmp ? tcmp : ocmp; if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } static inline boolean_t vdev_queue_class_fifo(zio_priority_t p) { return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || p == ZIO_PRIORITY_TRIM); } static void vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) + if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } else avl_add(&vq->vq_class[p].vqc_tree, zio); } static void vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; uint32_t empty; if (vdev_queue_class_fifo(p)) { list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); empty = avl_is_empty(tree); } vq->vq_cqueued &= ~(empty << p); } static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REBUILD: return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); } } static uint_t vdev_queue_max_async_writes(spa_t *spa) { uint_t writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Async writes may occur before the assignment of the spa's * dsl_pool_t if a self-healing zio is issued prior to the * completion of dmu_objset_open_impl(). */ if (dp == NULL) return (zfs_vdev_async_write_max_active); /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. */ dirty = dp->dp_dirty_total; if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } static uint_t vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_removal_min_active)); return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REBUILD: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_rebuild_min_active)); return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { uint32_t cq = vq->vq_cqueued; zio_priority_t p, p1; if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* * Find a queue that has not reached its minimum # outstanding i/os. * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ p1 = vq->vq_last_prio + 1; if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) p1 = 0; for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p)) goto found; } for (p = 0; p < p1; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p)) goto found; } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_max_active(vq, p)) break; } found: vq->vq_last_prio = p; return (p); } void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; vq->vq_vdev = vd; for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (vdev_queue_class_fifo(p)) { list_create(&vq->vq_class[p].vqc_list, sizeof (zio_t), offsetof(struct zio, io_queue_node.l)); } else { avl_create(&vq->vq_class[p].vqc_tree, vdev_queue_to_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node.a)); } } avl_create(&vq->vq_read_offset_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(&vq->vq_write_offset_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; list_create(&vq->vq_active_list, sizeof (struct zio), offsetof(struct zio, io_queue_node.l)); mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (vdev_queue_class_fifo(p)) list_destroy(&vq->vq_class[p].vqc_list); else avl_destroy(&vq->vq_class[p].vqc_tree); } avl_destroy(&vq->vq_read_offset_tree); avl_destroy(&vq->vq_write_offset_tree); list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { zio->io_queue_state = ZIO_QS_QUEUED; vdev_queue_class_add(vq, zio); if (zio->io_type == ZIO_TYPE_READ) avl_add(&vq->vq_read_offset_tree, zio); else if (zio->io_type == ZIO_TYPE_WRITE) avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { vdev_queue_class_remove(vq, zio); if (zio->io_type == ZIO_TYPE_READ) avl_remove(&vq->vq_read_offset_tree, zio); else if (zio->io_type == ZIO_TYPE_WRITE) avl_remove(&vq->vq_write_offset_tree, zio); zio->io_queue_state = ZIO_QS_NONE; } static boolean_t vdev_queue_is_interactive(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SCRUB: case ZIO_PRIORITY_REMOVAL: case ZIO_PRIORITY_INITIALIZING: case ZIO_PRIORITY_REBUILD: return (B_FALSE); default: return (B_TRUE); } } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_cactive[zio->io_priority]++; vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } zio->io_queue_state = ZIO_QS_ACTIVE; list_insert_tail(&vq->vq_active_list, zio); } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_cactive[zio->io_priority]--; vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; else vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; list_remove(&vq->vq_active_list, zio); zio->io_queue_state = ZIO_QS_NONE; } static void vdev_queue_agg_io_done(zio_t *aio) { abd_free(aio->io_abd); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) /* * Sufficiently adjacent io_offset's in ZIOs will be aggregated. We do this * by creating a gang ABD from the adjacent ZIOs io_abd's. By using * a gang ABD we avoid doing memory copies to and from the parent, * child ZIOs. The gang ABD also accounts for gaps between adjacent * io_offsets by simply getting the zero ABD for writes or allocating * a new ABD for reads and placing them in the gang ABD as well. */ static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; uint64_t maxgap = 0; uint64_t size; uint64_t limit; boolean_t stretch = B_FALSE; uint64_t next_offset; abd_t *abd; avl_tree_t *t; /* * TRIM aggregation should not be needed since code in zfs_trim.c can * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). */ if (zio->io_type == ZIO_TYPE_TRIM) return (NULL); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; if (limit == 0) return (NULL); limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID * leaf vdevs for aggregation. See the comment at the end of the * zio_vdev_io_start() function. */ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; t = &vq->vq_read_offset_tree; } else { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); t = &vq->vq_write_offset_tree; } /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation. */ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && IO_GAP(dio, first) <= maxgap && dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. * The aggregation limit does not apply to optional i/os, so that * we can issue contiguous writes even if they are larger than the * aggregation limit. */ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile. */ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* * We are going to include an optional io in our aggregated * span, thus closing the write gap. Only mandatory i/os can * start aggregated spans, so make sure that the next i/o * after our span is mandatory. */ dio = AVL_NEXT(t, last); ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; next_offset = first->io_offset; do { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); if (dio->io_offset != next_offset) { /* allocate a buffer for a read gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); ASSERT3U(dio->io_offset, >, next_offset); abd = abd_alloc_for_io( dio->io_offset - next_offset, B_TRUE); abd_gang_add(aio->io_abd, abd, B_TRUE); } if (dio->io_abd && (dio->io_size != abd_get_size(dio->io_abd))) { /* abd size not the same as IO size */ ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); abd_gang_add(aio->io_abd, abd, B_TRUE); } else { if (dio->io_flags & ZIO_FLAG_NODATA) { /* allocate a buffer for a write gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3P(dio->io_abd, ==, NULL); abd_gang_add(aio->io_abd, abd_get_zeros(dio->io_size), B_TRUE); } else { /* * We pass B_FALSE to abd_gang_add() * because we did not allocate a new * ABD, so it is assumed the caller * will free this ABD. */ abd_gang_add(aio->io_abd, dio->io_abd, B_FALSE); } } next_offset = dio->io_offset + dio->io_size; } while (dio != last); ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); /* * Callers must call zio_vdev_io_bypass() and zio_execute() for * aggregated (parent) I/Os so that we could avoid dropping the * queue's lock here to avoid a deadlock that we could encounter * due to lock order reversal between vq_lock and io_lock in * zio_change_priority(). */ return (aio); } static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } if (vdev_queue_class_fifo(p)) { zio = list_head(&vq->vq_class[p].vqc_list); } else { /* * For LBA-ordered queues (async / scrub / initializing), * issue the I/O which follows the most recently issued I/O * in LBA (offset) order, but to avoid starvation only within * the same 0.5 second interval as the first I/O. */ tree = &vq->vq_class[p].vqc_tree; zio = aio = avl_first(tree); if (zio->io_offset < vq->vq_last_offset) { vq->vq_io_search.io_timestamp = zio->io_timestamp; vq->vq_io_search.io_offset = vq->vq_last_offset; zio = avl_find(tree, &vq->vq_io_search, &idx); if (zio == NULL) { zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL || (zio->io_timestamp >> VDQ_T_SHIFT) != (aio->io_timestamp >> VDQ_T_SHIFT)) zio = aio; } } } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) { zio = aio; } else { vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we * need to simply discard it. We need to drop the vdev queue's * lock to avoid a deadlock that we could encounter since this * I/O will complete immediately. */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *dio, *nio; zio_link_t *zl = NULL; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherent their parent's priority, which might * not match the child's i/o type. Fix it up here. */ if (zio->io_type == ZIO_TYPE_READ) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { ASSERT(zio->io_type == ZIO_TYPE_TRIM); ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { while ((dio = zio_walk_parents(nio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, nio->io_type); zio_vdev_io_bypass(dio); zio_execute(dio); } zio_nowait(nio); return (NULL); } return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *dio, *nio; zio_link_t *zl = NULL; hrtime_t now = gethrtime(); vq->vq_io_complete_ts = now; vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { while ((dio = zio_walk_parents(nio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, nio->io_type); zio_vdev_io_bypass(dio); zio_execute(dio); } zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio * code to issue IOs without adding them to the vdev queue. In this * case, the zio is already going to be issued as quickly as possible * and so it doesn't need any reprioritization to help. */ if (zio->io_priority == ZIO_PRIORITY_NOW) return; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (zio->io_type == ZIO_TYPE_READ) { if (priority != ZIO_PRIORITY_SYNC_READ && priority != ZIO_PRIORITY_ASYNC_READ && priority != ZIO_PRIORITY_SCRUB) priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (priority != ZIO_PRIORITY_SYNC_WRITE && priority != ZIO_PRIORITY_ASYNC_WRITE) priority = ZIO_PRIORITY_ASYNC_WRITE; } mutex_enter(&vq->vq_lock); /* * If the zio is in none of the queues we can simply change * the priority. If the zio is waiting to be submitted we must * remove it from the queue and re-insert it with the new priority. * Otherwise, the zio is currently active and we cannot change its * priority. */ if (zio->io_queue_state == ZIO_QS_QUEUED) { vdev_queue_class_remove(vq, zio); zio->io_priority = priority; vdev_queue_class_add(vq, zio); } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ uint32_t vdev_queue_length(vdev_t *vd) { return (vd->vdev_queue.vq_active); } uint64_t vdev_queue_last_offset(vdev_t *vd) { return (vd->vdev_queue.vq_last_offset); } uint64_t vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) - return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); } ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW, "Aggregate write I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW, "Maximum number of active I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, UINT, ZMOD_RW, "Async write concurrency max threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, UINT, ZMOD_RW, "Async write concurrency min threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW, "Max active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW, "Min active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW, "Max active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW, "Min active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW, "Max active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW, "Min active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW, "Max active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW, "Min active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW, "Max active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW, "Min active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW, "Max active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW, "Min active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW, "Max active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW, "Min active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW, "Max active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW, "Min active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, "Default queue depth for each allocator"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index f91a2f3bbca5..2738385e260b 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1,7930 +1,7931 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; static zfsdev_state_t zfsdev_state_listhead; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ static uint64_t zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { const char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ (void) innvl; return (zfs_dozonecheck(zc->zc_name, cr)); } static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = (char *)nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; const char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. */ static int zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; const char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } /* * inputs: * poolname name of the pool * scan_type scan func (pool_scan_func_t) * scan_command scrub pause/resume flag (pool_scrub_cmd_t) */ static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, }; static int zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; uint64_t scan_type, scan_cmd; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) return (SET_ERROR(EINVAL)); if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if (scan_cmd == POOL_SCRUB_PAUSE) { error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); } else if (scan_type == POOL_SCAN_NONE) { error = spa_scan_stop(spa); } else { error = spa_scan(spa, scan_type); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_REMOVED: error = vdev_remove_wanted(spa, zc->zc_guid); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_USERPROP) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (nvlist_empty(genericnvl)) goto out; /* * Try to set them all in one batch. */ err = dsl_props_set(dsname, source, genericnvl); if (err == 0) goto out; /* * If batching fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * innvl: { * "vdevprops_set_vdev" -> guid * "vdevprops_set_props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); ASSERT(spa_writeable(spa)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * innvl: { * "vdevprops_get_vdev" -> guid * (optional) "vdevprops_get_props" -> { propname -> propid } * } * * outnvl: propname -> value */ static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} }; static int zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ (void) fsname, (void) innvl, (void) outnvl; return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } static int zfs_unmount_snap_cb(const char *snapname, void *arg) { (void) arg; zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { const char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND || cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; const char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; nvlist_t *redactnvl = NULL; const char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { const char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, &valstr1) == 0); VERIFY(nvpair_value_string(p2, &valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, /* * Setting ZFS_PROP_SHARESMB requires the objset type to be * known, which is not possible prior to receipt of raw sends. */ ZFS_PROP_SHARESMB, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } if (inherited_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inherited_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. */ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; const char *snapname; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(fp); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; minor_t minor; zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { const char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { fp = zfs_onexit_fd_hold(cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (fp != NULL) { ASSERT3U(minor, !=, 0); zfs_onexit_fd_rele(fp); } return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { (void) pool; return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; const char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; offset_t off; const char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; const char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((fp = zfs_file_get(fd)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); zfs_file_put(fp); return (error); } static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { (void) os, (void) buf; uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; const char *fromname = NULL; const char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. */ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. * innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, ARRAY_SIZE(zfs_keys_vdev_get_props)); zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_NONE, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_listhead.zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); if (zs != &zfsdev_state_listhead) kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 18c6cbf028b3..a11886136994 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4233 +1,4233 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 5; /* * Minimal time we care to delay commit waiting for more ZIL records. * At least FreeBSD kernel can't sleep for less than 2us at its best. * So requests to sleep for less then 5us is a waste of CPU time with * a risk of significant log latency increase due to oversleep. */ static uint64_t zil_min_commit_timeout = 5000; /* * See zil.h for more information about these fields. */ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ -static uint64_t zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. */ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If we previously claimed it, we need to free it. */ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. */ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. */ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). */ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. */ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; /* * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not * called for it yet, and when it will be, it won't be able to make * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ nlwb = list_next(&zilog->zl_lwb_list, lwb); if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL && !vd->vdev_nowritecache) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created). */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); lwb->lwb_state = LWB_STATE_CLOSED; /* * If there was an allocation failure then returned NULL will trigger * zil_commit_writer_stall() at the caller. This is inherently racy, * since allocation may not have happened yet. */ if (lwb->lwb_error != 0) return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guessing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, zil_blksz, uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; boolean_t slog; zbookmark_phys_t zb; zio_priority_t prio; int error; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); /* * The lwb is now ready to be issued, but it can be only if it already * got its block pointer allocated or the allocation has failed. * Otherwise leave it as-is, relying on some other thread to issue it * after allocating its block pointer via calling zil_lwb_write_issue() * for the previous lwb(s) in the chain. */ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. */ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a * single log block. Below that it is a tradeoff of additional memory copy * and possibly worse log space efficiency vs additional range lock/unlock. */ static uint_t zil_maxcopied = 7680; uint64_t zil_max_copied_data(zilog_t *zilog) { uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); return (MIN(max_data, zil_maxcopied)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to * hold reclen. For WR_COPIED, we need to fit the whole * record in one block, and reclen is the header size + the * data size. For WR_NEED_COPY, we can create multiple * records, splitting the data into multiple blocks, so we * only need to fit one word of data per block; in this case * reclen is just the header size (no data). */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_root( zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. */ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done, leave the rest of * itx list to somebody else who care. */ first = B_FALSE; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. * * If we had no already running or open LWBs, it can be * the workload is single-threaded. And if the ZIL write * latency is very small or if the LWB is almost full, it * may be cheaper to bypass the delay. */ if (lwb->lwb_state == LWB_STATE_OPENED && first) { hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_nmax - lwb->lwb_nused < lwb->lwb_nmax / 8) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on * if it's OPENED or CLOSED, and block any other threads that might * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * callback of zio issued by zil_lwb_write_issue() try to get it, * while zil_lwb_write_issue() is blocked on attempt to issue next * lwb it found in LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread * reached its timeout, we reset the zilog's "zl_cur_used" field * to influence the zil block size selection algorithm. * * By having to issue the lwb's zio here, it means the size of the * lwb was too large, given the incoming throughput of itxs. By * setting "zl_cur_used" to zero, we communicate this fact to the * block size selection algorithm, so it can take this information * into account, and potentially select a smaller size for the * next lwb block that is allocated. */ zilog->zl_cur_used = 0; if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. */ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. * * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). */ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, "Minimum delay we care for ZIL block commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 3b3b40fa73d8..a719e5492323 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1,5169 +1,5158 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. For larger buffers, we want a cache - * for each quarter-power of 2. - */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - size_t data_cflags, cflags; - - data_cflags = KMC_NODEBUG; - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; + size_t align, cflags, data_cflags; + char name[32]; + /* + * Create cache for each half-power of 2 size, starting from + * SPA_MINBLOCKSIZE. It should give us memory space efficiency + * of ~7/8, sufficient for transient allocations mostly using + * these caches. + */ + size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; + if (!IS_P2ALIGNED(size, p2 / 2)) + continue; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; - /* - * Here's the problem - on 4K native devices in userland on - * Linux using O_DIRECT, buffers must be 4K aligned or I/O - * will fail with EINVAL, causing zdb (and others) to coredump. - * Since userland probably doesn't need optimized buffer caches, - * we just force 4K alignment on everything. - */ - align = 8 * SPA_MINBLOCKSIZE; -#else - if (size < PAGESIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = PAGESIZE; - } #endif - if (align != 0) { - char name[36]; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, - size, align, NULL, NULL, NULL, NULL, NULL, - cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; - } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, data_cflags); + if (IS_P2ALIGNED(size, PAGESIZE)) + align = PAGESIZE; + else + align = 1 << (highbit64(size ^ (size - 1)) - 1); + + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + data_cflags = KMC_NODEBUG; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. + */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. */ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child_first(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; ASSERT(list_is_empty(&cio->io_parent_list)); list_insert_head(&cio->io_parent_list, zl); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We only do this if the parent's zio type matches the * child's type. Otherwise dispatch the parent zio in its * own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && pio->io_type == zio->io_type) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)bp->blk_phys_birth, (long long)bp->blk_birth, (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return (errors == 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_SKIP: return (errors == 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * Round provided allocation size up to a value that can be allocated * by at least some vdev(s) in the pool with minimum or no additional * padding and without extra space usage on others */ static uint64_t zio_roundup_alloc_size(spa_t *spa, uint64_t size) { if (size > spa->spa_min_alloc) return (roundup(size, spa->spa_gcd_alloc)); return (spa->spa_min_alloc); } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || MIN(zp->zp_copies, spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = NULL; psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cbuf != NULL) zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ size_t rounded = (size_t)zio_roundup_alloc_size(spa, psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * If one copy was requested, store 2 copies of the GBH, so that we * can still traverse all the data (e.g. to free or scrub) even if a * block is damaged. Note that we can't store 3 copies of the GBH in * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. */ int gbh_copies = copies; if (gbh_copies == 1) { gbh_copies = MIN(2, spa_max_replication(spa)); } int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. */ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; zio->io_allocator = allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); nio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); diff --git a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in index 23c3ed6ff408..d56967d7a8b1 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs-dkms.spec.in @@ -1,76 +1,160 @@ %{?!packager: %define packager Brian Behlendorf } %if ! 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version}%{?openEuler} %define not_rpm 1 %endif # Exclude input files from mangling %global __brp_mangle_shebangs_exclude_from ^/usr/src/.*$ %define module @PACKAGE@ %define mkconf scripts/dkms.mkconf Name: %{module}-dkms Version: @VERSION@ Release: @RELEASE@%{?dist} Summary: Kernel module(s) (dkms) Group: System Environment/Kernel License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{module}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) BuildArch: noarch Requires: dkms >= 2.2.0.3 +Requires(pre): dkms >= 2.2.0.3 Requires(post): dkms >= 2.2.0.3 Requires(preun): dkms >= 2.2.0.3 Requires: gcc, make, perl, diffutils Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version}%{?openEuler} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} AutoReqProv: no %if (0%{?fedora}%{?suse_version}%{?openEuler}) || (0%{?rhel} && 0%{?rhel} < 9) # We don't directly use it, but if this isn't installed, rpmbuild as root can # crash+corrupt rpmdb # See issue #12071 BuildRequires: ncompress %endif %description This package contains the dkms ZFS kernel modules. %prep %setup -q -n %{module}-%{version} %build %{mkconf} -n %{module} -v %{version} -f dkms.conf %install if [ "$RPM_BUILD_ROOT" != "/" ]; then rm -rf $RPM_BUILD_ROOT fi mkdir -p $RPM_BUILD_ROOT/usr/src/ cp -rf ${RPM_BUILD_DIR}/%{module}-%{version} $RPM_BUILD_ROOT/usr/src/ %clean if [ "$RPM_BUILD_ROOT" != "/" ]; then rm -rf $RPM_BUILD_ROOT fi %files %defattr(-,root,root) /usr/src/%{module}-%{version} +%pre +echo "Running pre installation script: $0. Parameters: $*" +# We don't want any other versions lingering around in dkms. +# Tests with 'dnf' showed that in case of reinstall, or upgrade +# the preun scriptlet removed the version we are trying to install. +# Because of this, find all zfs dkms sources in /var/lib/dkms and +# remove them, if we find a matching version in dkms. + +dkms_root=/var/lib/dkms +if [ -d ${dkms_root}/%{module} ]; then + cd ${dkms_root}/%{module} + for x in [[:digit:]]*; do + [ -d "$x" ] || continue + otherver="$x" + opath="${dkms_root}/%{module}/${otherver}" + if [ "$otherver" != %{version} ]; then + # This is a workaround for a broken 'dkms status', we caused in a previous version. + # One day it might be not needed anymore, but it does not hurt to keep it. + if dkms status -m %{module} -v "$otherver" 2>&1 | grep "${opath}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + if [ -L "${opath}/source" -a ! -d "${opath}/source" ] + then + echo "Trying to fix it by removing the symlink: ${opath}/source" >&2 + echo "You should manually remove ${opath}" >&2 + rm -f "${opath}/source" || echo "Removal failed!" >&2 + fi + fi + if [ `dkms status -m %{module} -v "$otherver" | grep -c %{module}` -gt 0 ]; then + echo "Removing old %{module} dkms modules version $otherver from all kernels." + dkms remove -m %{module} -v "$otherver" --all ||: + fi + fi + done +fi + +# Uninstall this version of zfs dkms modules before installation of the package. +if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all ||: +fi + +%post +echo "Running post installation script: $0. Parameters: $*" +# Add the module to dkms, as reccommended in the dkms man page. +# This is generally rpm specfic. +# But this also may help, if we have a broken 'dkms status'. +# Because, if the sources are available and only the symlink pointing +# to them is missing, this will resolve the situation +echo "Adding %{module} dkms modules version %{version} to dkms." +dkms add -m %{module} -v %{version} %{!?not_rpm:--rpm_safe_upgrade} ||: + +# After installing the package, dkms install this zfs version for the current kernel. +# Force the overwriting of old modules to avoid diff warnings in dkms status. +# Or in case of a downgrade to overwrite newer versions. +# Or if some other backed up versions have been restored before. +echo "Installing %{module} dkms modules version %{version} for the current kernel." +dkms install --force -m %{module} -v %{version} ||: + %preun -dkms remove -m %{module} -v %{version} --all +dkms_root="/var/lib/dkms/%{module}/%{version}" +echo "Running pre uninstall script: $0. Parameters: $*" +# In case of upgrade we do nothing. See above comment in pre hook. +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then + echo "This is an upgrade. Skipping pre uninstall action." + exit 0 +fi + +# Check if we uninstall the package. In that case remove the dkms modules. +# '0' is the value for the first parameter for rpm packages. +# 'remove' or 'purge' are the possible names for deb packages. +if [ "$1" = "0" -o "$1" = "remove" -o "$1" = "purge" ] ; then + if [ `dkms status -m %{module} -v %{version} | grep -c %{module}` -gt 0 ]; then + echo "Removing %{module} dkms modules version %{version} from all kernels." + dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} && exit 0 + fi + # If removing the modules failed, it might be because of the broken 'dkms status'. + if dkms status -m %{module} -v %{version} 2>&1 | grep "${dkms_root}/source/dkms.conf does not exist" + then + echo "ERROR: dkms status is broken!" >&2 + echo "You should manually remove ${dkms_root}" >&2 + echo "WARNING: installed modules in /lib/modules/`uname -r`/extra could not be removed automatically!" >&2 + fi +else + echo "Script parameter $1 did not match any removal condition." +fi -%posttrans -/usr/lib/dkms/common.postinst %{module} %{version} +exit 0 diff --git a/sys/contrib/openzfs/scripts/Makefile.am b/sys/contrib/openzfs/scripts/Makefile.am index 95640727ac6a..b43bf97dbdf4 100644 --- a/sys/contrib/openzfs/scripts/Makefile.am +++ b/sys/contrib/openzfs/scripts/Makefile.am @@ -1,95 +1,97 @@ scriptsdir = $(datadir)/$(PACKAGE) dist_noinst_SCRIPTS = \ %D%/commitcheck.sh \ %D%/common.sh.in \ %D%/dkms.mkconf \ %D%/dkms.postbuild \ %D%/kmodtool \ %D%/make_gitrev.sh \ %D%/man-dates.sh \ %D%/mancheck.sh \ %D%/paxcheck.sh \ %D%/zfs-tests-color.sh scripts_scripts = \ %D%/zfs-helpers.sh \ %D%/zfs-tests.sh \ %D%/zfs.sh \ %D%/zimport.sh \ %D%/zloop.sh if CONFIG_USER dist_scripts_SCRIPTS = $(scripts_scripts) +dist_zfsexec_SCRIPTS = \ + %D%/zfs_prepare_disk else dist_noinst_SCRIPTS += $(scripts_scripts) endif dist_noinst_DATA += \ %D%/cstyle.pl \ %D%/enum-extract.pl \ %D%/update_authors.pl \ %D%/zfs2zol-patch.sed \ %D%/zol2zfs-patch.sed SHELLCHECKSCRIPTS += $(dist_scripts_SCRIPTS) $(dist_noinst_SCRIPTS) define SCRIPTS_EXTRA_ENVIRONMENT # Only required for in-tree use export INTREE="yes" export GDB="libtool --mode=execute gdb" export LDMOD=/sbin/insmod export CMD_DIR=$(abs_top_builddir) export UDEV_SCRIPT_DIR=$(abs_top_srcdir)/udev export UDEV_CMD_DIR=$(abs_top_builddir)/udev export UDEV_RULE_DIR=$(abs_top_builddir)/udev/rules.d export ZEDLET_ETC_DIR=$$CMD_DIR/cmd/zed/zed.d export ZEDLET_LIBEXEC_DIR=$$CMD_DIR/cmd/zed/zed.d export ZPOOL_SCRIPT_DIR=$$CMD_DIR/cmd/zpool/zpool.d export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/cmd/zpool/zpool.d export ZPOOL_COMPAT_DIR=$$CMD_DIR/cmd/zpool/compatibility.d export CONTRIB_DIR=$(abs_top_builddir)/contrib export LIB_DIR=$(abs_top_builddir)/.libs export SYSCONF_DIR=$(abs_top_builddir)/etc export INSTALL_UDEV_DIR=$(udevdir) export INSTALL_UDEV_RULE_DIR=$(udevruledir) export INSTALL_MOUNT_HELPER_DIR=$(mounthelperdir) export INSTALL_SYSCONF_DIR=$(sysconfdir) export INSTALL_PYTHON_DIR=$(pythonsitedir) export KMOD_SPL=$(abs_top_builddir)/module/spl.ko export KMOD_ZFS=$(abs_top_builddir)/module/zfs.ko export KMOD_FREEBSD=$(abs_top_builddir)/module/openzfs.ko endef export SCRIPTS_EXTRA_ENVIRONMENT CLEANFILES += %D%/common.sh %D%/common.sh: %D%/common.sh.in Makefile -$(AM_V_at)$(MKDIR_P) $(@D) -$(AM_V_GEN)$(SED) -e '/^export BIN_DIR=/s|$$|$(abs_top_builddir)/tests/zfs-tests/bin|' \ -e '/^export SBIN_DIR=/s|$$|$(abs_top_builddir)|' \ -e '/^export LIBEXEC_DIR=/s|$$|$(abs_top_builddir)|' \ -e '/^export ZTS_DIR=/s|$$|$(abs_top_srcdir)/tests|' \ -e '/^export SCRIPT_DIR=/s|$$|$(abs_top_srcdir)/scripts|' \ $< >$@ -$(AM_V_at)echo "$$SCRIPTS_EXTRA_ENVIRONMENT" >>$@ ALL_LOCAL += scripts-all-local scripts-all-local: %D%/common.sh -SCRIPT_COMMON=$< $(srcdir)/%D%/zfs-tests.sh -c CLEAN_LOCAL += scripts-clean-local scripts-clean-local: -$(RM) -r tests/zfs-tests/bin/ INSTALL_DATA_HOOKS += scripts-install-data-hook scripts-install-data-hook: %D%/common.sh.in Makefile -$(SED) -e '/^export BIN_DIR=/s|$$|$(bindir)|' \ -e '/^export SBIN_DIR=/s|$$|$(sbindir)|' \ -e '/^export LIBEXEC_DIR=/s|$$|$(zfsexecdir)|' \ -e '/^export ZTS_DIR=/s|$$|$(datadir)/$(PACKAGE)|' \ -e '/^export SCRIPT_DIR=/s|$$|$(datadir)/$(PACKAGE)|' \ $< >$(DESTDIR)$(datadir)/$(PACKAGE)/common.sh diff --git a/sys/contrib/openzfs/scripts/zfs_prepare_disk b/sys/contrib/openzfs/scripts/zfs_prepare_disk new file mode 100755 index 000000000000..02aa9f8a7728 --- /dev/null +++ b/sys/contrib/openzfs/scripts/zfs_prepare_disk @@ -0,0 +1,17 @@ +#!/bin/sh +# +# This is an optional helper script that is automatically called by libzfs +# before a disk is about to be added into the pool. It can be modified by +# the user to run whatever commands are necessary to prepare a disk for +# inclusion into the pool. For example, users can add lines to this +# script to do things like update the drive's firmware or check the drive's +# health. The script is optional and can be removed if it is not needed. +# +# See the zfs_prepare_disk(8) man page for details. +# +# Example: +# +# echo "Prepare disk $VDEV_PATH ($VDEV_UPATH) for $VDEV_PREPARE in $POOL_NAME" +# + +exit 0 diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index 2252e46df3a8..8bc55a1b4b47 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -1,228 +1,228 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/posix:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix'] [tests/functional/acl/posix-sa:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix-sa'] [tests/functional/atime:Linux] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_copyfilerange_fallback', 'block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', 'block_cloning_copyfilerange_cross_dataset', 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] [tests/functional/cli_root/zfs:Linux] tests = ['zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_mount:Linux] tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_mount_013_pos', 'zfs_mount_014_neg', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_share:Linux] tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', 'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_unshare:Linux] tests = ['zfs_unshare_008_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_sysfs:Linux] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] [tests/functional/cli_root/zpool_add:Linux] tests = ['add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_expand:Linux] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] [tests/functional/cli_root/zpool_import:Linux] tests = ['zpool_import_hostid_changed', 'zpool_import_hostid_changed_unclean_export', 'zpool_import_hostid_changed_cachefile', 'zpool_import_hostid_changed_cachefile_unclean_export'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_split:Linux] tests = ['zpool_split_wholedisk'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/compression:Linux] tests = ['compress_004_pos'] tags = ['functional', 'compression'] [tests/functional/devices:Linux] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] tests = ['fadvise_sequential'] tags = ['functional', 'fadvise'] [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc', 'fallocate_zero-range'] tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', - 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', - 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', - 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', - 'zpool_status_-s'] + 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', + 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', + 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', + 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/io:Linux] tests = ['libaio', 'io_uring'] tags = ['functional', 'io'] [tests/functional/largest_pool:Linux] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp:Linux] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] [tests/functional/projectquota:Linux] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg'] tags = ['functional', 'projectquota'] [tests/functional/dos_attributes:Linux] tests = ['read_dos_attrs_001', 'write_dos_attrs_001'] tags = ['functional', 'dos_attributes'] [tests/functional/renameat2:Linux] tests = ['renameat2_noreplace', 'renameat2_exchange', 'renameat2_whiteout'] tags = ['functional', 'renameat2'] [tests/functional/rsend:Linux] tests = ['send_realloc_dnode_size', 'send_encrypted_files'] tags = ['functional', 'rsend'] [tests/functional/simd:Linux] pre = post = tests = ['simd_supported'] tags = ['functional', 'simd'] [tests/functional/snapshot:Linux] tests = ['snapshot_015_pos', 'snapshot_016_pos'] tags = ['functional', 'snapshot'] [tests/functional/tmpfile:Linux] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', 'tmpfile_stat_mode'] tags = ['functional', 'tmpfile'] [tests/functional/upgrade:Linux] tests = ['upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace:Linux] tests = ['user_namespace_001', 'user_namespace_002', 'user_namespace_003', 'user_namespace_004'] tags = ['functional', 'user_namespace'] [tests/functional/userquota:Linux] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] [tests/functional/zvol/zvol_misc:Linux] tests = ['zvol_misc_fua'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/idmap_mount:Linux] tests = ['idmap_mount_001', 'idmap_mount_002', 'idmap_mount_003', 'idmap_mount_004', 'idmap_mount_005'] tags = ['functional', 'idmap_mount'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 5d1360380de5..4608e87522a3 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -1,475 +1,476 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # # This script must remain compatible with Python 3.6+. # import os import re import sys import argparse # # This script parses the stdout of zfstest, which has this format: # # Test: /path/to/testa (run as root) [00:00] [PASS] # Test: /path/to/testb (run as jkennedy) [00:00] [PASS] # Test: /path/to/testc (run as root) [00:00] [FAIL] # [...many more results...] # # Results Summary # FAIL 22 # SKIP 32 # PASS 1156 # # Running Time: 02:50:31 # Percent passed: 95.5% # Log directory: /var/tmp/test_results/20180615T205926 # # # Common generic reasons for a test or test group to be skipped. # # Some test cases are known to fail in ways which are not harmful or dangerous. # In these cases simply mark the test as a known failure until it can be # updated and the issue resolved. Note that it's preferable to open a unique # issue on the GitHub issue tracker for each test case failure. # known_reason = 'Known issue' # # Some tests require that a test user be able to execute the zfs utilities. # This may not be possible when testing in-tree due to the default permissions # on the user's home directory. When testing this can be resolved by granting # group read access. # # chmod 0750 $HOME # exec_reason = 'Test user execute permissions required for utilities' # # Some tests require that the kernel supports renameat2 syscall. # renameat2_reason = 'Kernel renameat2 support required' # # Some tests require the O_TMPFILE flag which was first introduced in the # 3.11 kernel. # tmpfile_reason = 'Kernel O_TMPFILE support required' # # Some tests require the statx(2) system call on Linux which was first # introduced in the 4.11 kernel. # statx_reason = 'Kernel statx(2) system call required on Linux' # # Some tests require that the lsattr utility support the project id feature. # project_id_reason = 'lsattr with set/show project ID required' # # Some tests require that the kernel support user namespaces. # user_ns_reason = 'Kernel user namespace support required' # # Some rewind tests can fail since nothing guarantees that old MOS blocks # are not overwritten. Snapshots protect datasets and data files but not # the MOS. Reasonable efforts are made in the test case to increase the # odds that some txgs will have their MOS data left untouched, but it is # never a sure thing. # rewind_reason = 'Arbitrary pool rewind is not guaranteed' # # Some tests require a minimum version of the fio benchmark utility. # Older distributions such as CentOS 6.x only provide fio-2.0.13. # fio_reason = 'Fio v2.3 or newer required' # # Some tests require that the DISKS provided support the discard operation. # Normally this is not an issue because loop back devices are used for DISKS # and they support discard (TRIM/UNMAP). # trim_reason = 'DISKS must support discard (TRIM/UNMAP)' # # Some tests on FreeBSD require the fspacectl(2) system call and the # truncate(1) utility supporting the -d option. The system call was first # introduced in FreeBSD version 1400032. # fspacectl_reason = 'fspacectl(2) and truncate -d support required' # # Some tests are not applicable to a platform or need to be updated to operate # in the manor required by the platform. Any tests which are skipped for this # reason will be suppressed in the final analysis output. # na_reason = "Not applicable" # # Some test cases doesn't have all requirements to run on Github actions CI. # ci_reason = 'CI runner doesn\'t have all requirements' # # Idmapped mount is only supported in kernel version >= 5.12 # idmap_reason = 'Idmapped mount needs kernel 5.12+' # # copy_file_range() is not supported by all kernels # cfr_reason = 'Kernel copy_file_range support required' cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' # # These tests are known to fail, thus we use this list to prevent these # failures from failing the job as a whole; only unexpected failures # bubble up to cause this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # known = { 'casenorm/mixed_none_lookup_ci': ['FAIL', 7633], 'casenorm/mixed_formd_lookup_ci': ['FAIL', 7633], 'cli_root/zpool_import/import_rewind_device_replaced': ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], 'rsend/rsend_008_pos': ['SKIP', 6066], 'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason], } if sys.platform.startswith('freebsd'): known.update({ 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], 'cli_root/zpool_resilver/zpool_resilver_concurrent': ['SKIP', na_reason], 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], 'rsend/send_raw_ashift': ['SKIP', 14961], }) elif sys.platform.startswith('linux'): known.update({ 'casenorm/mixed_formd_lookup': ['FAIL', 7633], 'casenorm/mixed_formd_delete': ['FAIL', 7633], 'casenorm/sensitive_formd_lookup': ['FAIL', 7633], 'casenorm/sensitive_formd_delete': ['FAIL', 7633], 'removal/removal_with_zdb': ['SKIP', known_reason], 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], }) # # These tests may occasionally fail or be skipped. We want there failures # to be reported but only unexpected failures should bubble up to cause # this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # maybe = { 'append/threadsappend_001_pos': ['FAIL', 6136], 'chattr/setup': ['SKIP', exec_reason], 'crtime/crtime_001_pos': ['SKIP', statx_reason], 'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason], 'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', 5479], 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason], 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason], 'cli_root/zfs_share/zfs_share_concurrent_shares': ['FAIL', known_reason], 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839], 'cli_root/zpool_initialize/zpool_initialize_import_export': ['FAIL', 11948], 'cli_root/zpool_labelclear/zpool_labelclear_removed': ['FAIL', known_reason], 'cli_root/zpool_trim/setup': ['SKIP', trim_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', 6141], 'delegate/setup': ['SKIP', exec_reason], 'fallocate/fallocate_punch-hole': ['SKIP', fspacectl_reason], 'history/history_004_pos': ['FAIL', 7026], 'history/history_005_neg': ['FAIL', 6680], 'history/history_006_neg': ['FAIL', 5657], 'history/history_008_pos': ['FAIL', known_reason], 'history/history_010_pos': ['SKIP', exec_reason], 'io/mmap': ['SKIP', fio_reason], 'largest_pool/largest_pool_001_pos': ['FAIL', known_reason], 'mmp/mmp_on_uberblocks': ['FAIL', known_reason], 'pam/setup': ['SKIP', "pamtester might be not available"], 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946], 'projectquota/setup': ['SKIP', exec_reason], 'removal/removal_condense_export': ['FAIL', known_reason], 'renameat2/setup': ['SKIP', renameat2_reason], 'reservation/reservation_008_pos': ['FAIL', 7741], 'reservation/reservation_018_pos': ['FAIL', 5642], 'snapshot/clone_001_pos': ['FAIL', known_reason], 'snapshot/snapshot_009_pos': ['FAIL', 7961], 'snapshot/snapshot_010_pos': ['FAIL', 7961], 'snapused/snapused_004_pos': ['FAIL', 5513], 'tmpfile/setup': ['SKIP', tmpfile_reason], 'trim/setup': ['SKIP', trim_reason], 'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason], 'user_namespace/setup': ['SKIP', user_ns_reason], 'userquota/setup': ['SKIP', exec_reason], 'vdev_zaps/vdev_zaps_004_pos': ['FAIL', known_reason], 'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', 5848], } if sys.platform.startswith('freebsd'): maybe.update({ 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], 'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], 'fault/auto_spare_multiple': ['FAIL', 11889], 'fault/auto_spare_shared': ['FAIL', 11889], 'fault/decompress_fault': ['FAIL', 11889], 'io/io_uring': ['SKIP', 'io_uring support required'], 'limits/filesystem_limit': ['SKIP', known_reason], 'limits/snapshot_limit': ['SKIP', known_reason], 'mmp/mmp_active_import': ['FAIL', known_reason], 'mmp/mmp_exported_import': ['FAIL', known_reason], 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], 'block_cloning/block_cloning_disabled_copyfilerange': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_partial': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_fallback': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': ['SKIP', cfr_cross_reason], }) # Not all Github actions runners have scsi_debug module, so we may skip # some tests which use it. if os.environ.get('CI') == 'true': known.update({ 'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/setup': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_002_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_004_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_006_neg': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_007_pos': ['SKIP', ci_reason], 'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason], 'fault/auto_offline_001_pos': ['SKIP', ci_reason], 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], + 'fault/auto_replace_002_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) maybe.update({ 'events/events_002_pos': ['FAIL', 11546], }) def process_results(pathname): try: f = open(pathname) except IOError as e: print('Error opening file:', e) sys.exit(1) prefix = '/zfs-tests/tests/(?:functional|perf/regression)/' pattern = \ r'^Test(?:\s+\(\S+\))?:' + \ rf'\s*\S*{prefix}(\S+)' + \ r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} logdir = 'Could not determine log directory.' for line in f.readlines(): m = re.match(pattern, line) if m and len(m.groups()) == 4: d[m.group(1)] = m.group(4) continue m = re.match(pattern_log, line) if m: logdir = m.group(1) return d, logdir class ListMaybesAction(argparse.Action): def __init__(self, option_strings, dest="SUPPRESS", default="SUPPRESS", help="list flaky tests and exit"): super(ListMaybesAction, self).__init__( option_strings=option_strings, dest=dest, default=default, nargs=0, help=help) def __call__(self, parser, namespace, values, option_string=None): for test in maybe: print(test) sys.exit(0) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Analyze ZTS logs') parser.add_argument('logfile') parser.add_argument('--list-maybes', action=ListMaybesAction) parser.add_argument('--no-maybes', action='store_false', dest='maybes') args = parser.parse_args() results, logdir = process_results(args.logfile) if not results: print("\n\nNo test results were found.") print("Log directory:", logdir) sys.exit(0) expected = [] unexpected = [] all_maybes = True for test in list(results.keys()): if results[test] == "PASS": continue setup = test.replace(os.path.basename(test), "setup") if results[test] == "SKIP" and test != setup: if setup in known and known[setup][0] == "SKIP": continue if setup in maybe and maybe[setup][0] == "SKIP": continue if (test in known and results[test] in known[test][0]): expected.append(test) elif test in maybe and results[test] in maybe[test][0]: if results[test] == 'SKIP' or args.maybes: expected.append(test) elif not args.maybes: unexpected.append(test) else: unexpected.append(test) all_maybes = False print("\nTests with results other than PASS that are expected:") for test in sorted(expected): issue_url = 'https://github.com/openzfs/zfs/issues/' # Include the reason why the result is expected, given the following: # 1. Suppress test results which set the "Not applicable" reason. # 2. Numerical reasons are assumed to be GitHub issue numbers. # 3. When an entire test group is skipped only report the setup reason. if test in known: if known[test][1] == na_reason: continue elif isinstance(known[test][1], int): expect = f"{issue_url}{known[test][1]}" else: expect = known[test][1] elif test in maybe: if isinstance(maybe[test][1], int): expect = f"{issue_url}{maybe[test][1]}" else: expect = maybe[test][1] elif setup in known and known[setup][0] == "SKIP" and setup != test: continue elif setup in maybe and maybe[setup][0] == "SKIP" and setup != test: continue else: expect = "UNKNOWN REASON" print(f" {results[test]} {test} ({expect})") print("\nTests with result of PASS that are unexpected:") for test in sorted(known.keys()): # We probably should not be silently ignoring the case # where "test" is not in "results". if test not in results or results[test] != "PASS": continue print(f" {results[test]} {test} (expected {known[test][0]})") print("\nTests with results other than PASS that are unexpected:") for test in sorted(unexpected): expect = "PASS" if test not in known else known[test][0] print(f" {results[test]} {test} (expected {expect})") if len(unexpected) == 0: sys.exit(0) elif not args.maybes and all_maybes: sys.exit(2) else: sys.exit(1) diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index fa545e06bbf3..648f2203dfba 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -1,233 +1,234 @@ # # Copyright (c) 2016, 2019 by Delphix. All rights reserved. # These variables are used by zfs-tests.sh to constrain which utilities # may be used by the suite. The suite will create a directory which is # the only element of $PATH and create symlinks from that dir to the # binaries listed below. # # Please keep the contents of each variable sorted for ease of reading # and maintenance. # export SYSTEM_FILES_COMMON='awk basename bc bunzip2 bzcat cat chgrp chmod chown cksum cmp cp cpio cut date dd df diff dirname dmesg du echo env expr false file find fio getconf getent getfacl grep gunzip gzip head hostname id iostat kill ksh ldd ln ls mkdir mknod mkfifo mktemp mount mv net od openssl pamtester pax pgrep ping pkill printf ps python3 readlink rm rmdir rsync scp script sed seq setfacl sh sleep sort ssh stat strings sudo swapoff swapon sync tail tar timeout touch tr true truncate umount uname uniq vmstat wc' export SYSTEM_FILES_FREEBSD='chflags compress diskinfo fsck getextattr gpart jail jexec jls lsextattr md5 mdconfig newfs pw rmextattr setextattr sha256 showmount swapctl sysctl trim uncompress' export SYSTEM_FILES_LINUX='attr blkid blkdiscard blockdev chattr exportfs fallocate + flock free getfattr groupadd groupdel groupmod hostid + logger losetup lsattr lsblk lscpu lsmod lsscsi md5sum mkswap modprobe + mountpoint mpstat nsenter parted perf setfattr + setpriv sha256sum udevadm unshare useradd userdel usermod - setpriv - mountpoint - flock - logger' + wipefs' export ZFS_FILES='zdb zfs zhack zinject zpool ztest raidz_test arc_summary arcstat zilstat dbufstat mount.zfs zed zgenhostid zstream zfs_ids_to_path zpool_influxdb' export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile devname2devid dir_rd_update draid file_fadvise file_append file_check file_trunc file_write get_diff getversion largest_file libzfs_input_check mkbusy mkfile mkfiles mktree mmap_exec mmap_libaio mmap_seek mmap_sync mmapwrite nvlist_to_lua randfree_file randwritecomp readmmap read_dos_attributes renameat2 rename_dir rm_lnkcnt_zero_file send_doall threadsappend user_ns_exec write_dos_attributes xattrtest stride_dd zed_fd_spill-zedlet suid_write_to_file cp_files blake3_test edonr_test skein_test sha2_test ctime truncate_test ereports zfs_diff-socket dosmode_readonly_write idmap_util' diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index 844caa17d8ed..b4d2b91dd476 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1,3875 +1,3896 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. # Copyright (c) 2012, 2020, Delphix. All rights reserved. # Copyright (c) 2017, Tim Chase. All rights reserved. # Copyright (c) 2017, Nexenta Systems Inc. All rights reserved. # Copyright (c) 2017, Lawrence Livermore National Security LLC. # Copyright (c) 2017, Datto Inc. All rights reserved. # Copyright (c) 2017, Open-E Inc. All rights reserved. # Copyright (c) 2021, The FreeBSD Foundation. # Use is subject to license terms. # . ${STF_SUITE}/include/tunables.cfg . ${STF_TOOLS}/include/logapi.shlib . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib +# On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes +# some tests to fail. Fix it up here. +if [ "$PWD" = "." ] ; then + PWD="$(readlink -f $PWD)" +fi + # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. # if [ -n "$STF_PATH" ]; then export PATH="$STF_PATH" fi # # Generic dot version comparison function # # Returns success when version $1 is greater than or equal to $2. # function compare_version_gte { [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } # Linux kernel version comparison function # # $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version # # Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] # function linux_version { typeset ver="$1" [ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") typeset version major minor _ IFS='.' read -r version major minor _ <<<"$ver" [ -z "$version" ] && version=0 [ -z "$major" ] && major=0 [ -z "$minor" ] && minor=0 echo $((version * 100000 + major * 1000 + minor)) } # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise function is_linux { [ "$UNAME" = "Linux" ] } # Determine if this is an illumos test system # # Return 0 if platform illumos, 1 if otherwise function is_illumos { [ "$UNAME" = "illumos" ] } # Determine if this is a FreeBSD test system # # Return 0 if platform FreeBSD, 1 if otherwise function is_freebsd { [ "$UNAME" = "FreeBSD" ] } # Determine if this is a 32-bit system # # Return 0 if platform is 32-bit, 1 if otherwise function is_32bit { [ $(getconf LONG_BIT) = "32" ] } # Determine if kmemleak is enabled # # Return 0 if kmemleak is enabled, 1 if otherwise function is_kmemleak { is_linux && [ -e /sys/kernel/debug/kmemleak ] } # Determine whether a dataset is mounted # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs # # Return 0 if dataset is mounted; 1 if unmounted; 2 on error function ismounted { typeset fstype=$2 [[ -z $fstype ]] && fstype=zfs typeset out dir name case $fstype in zfs) if [[ "$1" == "/"* ]] ; then ! zfs mount | awk -v fs="$1" '$2 == fs {exit 1}' else ! zfs mount | awk -v ds="$1" '$1 == ds {exit 1}' fi ;; ufs|nfs) if is_freebsd; then mount -pt $fstype | while read dev dir _t _flags; do [[ "$1" == "$dev" || "$1" == "$dir" ]] && return 0 done else out=$(df -F $fstype $1 2>/dev/null) || return dir=${out%%\(*} dir=${dir%% *} name=${out##*\(} name=${name%%\)*} name=${name%% *} [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 fi ;; ext*) df -t $fstype $1 > /dev/null 2>&1 ;; zvol) if [[ -L "$ZVOL_DEVDIR/$1" ]]; then link=$(readlink -f $ZVOL_DEVDIR/$1) [[ -n "$link" ]] && \ mount | grep -q "^$link" && \ return 0 fi ;; *) false ;; esac } # Return 0 if a dataset is mounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function mounted { ismounted $1 $2 } # Return 0 if a dataset is unmounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function unmounted { ! ismounted $1 $2 } function default_setup { default_setup_noexit "$@" log_pass } function default_setup_no_mountpoint { default_setup_noexit "$1" "$2" "$3" "yes" log_pass } # # Given a list of disks, setup storage pools and datasets. # function default_setup_noexit { typeset disklist=$1 typeset container=$2 typeset volume=$3 typeset no_mountpoint=$4 log_note begin default_setup_noexit if is_global_zone; then if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL $disklist else reexport_pool fi rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR mkdir -p $TESTDIR || log_unresolved Could not create $TESTDIR log_must zfs create $TESTPOOL/$TESTFS if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS fi if [[ -n $container ]]; then rm -rf $TESTDIR1 || \ log_unresolved Could not remove $TESTDIR1 mkdir -p $TESTDIR1 || \ log_unresolved Could not create $TESTDIR1 log_must zfs create $TESTPOOL/$TESTCTR log_must zfs set canmount=off $TESTPOOL/$TESTCTR log_must zfs create $TESTPOOL/$TESTCTR/$TESTFS1 if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR1 \ $TESTPOOL/$TESTCTR/$TESTFS1 fi fi if [[ -n $volume ]]; then if is_global_zone ; then log_must zfs create -V $VOLSIZE $TESTPOOL/$TESTVOL block_device_wait else log_must zfs create $TESTPOOL/$TESTVOL fi fi } # # Given a list of disks, setup a storage pool, file system and # a container. # function default_container_setup { typeset disklist=$1 default_setup "$disklist" "true" } # # Given a list of disks, setup a storage pool,file system # and a volume. # function default_volume_setup { typeset disklist=$1 default_setup "$disklist" "" "true" } # # Given a list of disks, setup a storage pool,file system, # a container and a volume. # function default_container_volume_setup { typeset disklist=$1 default_setup "$disklist" "true" "true" } # # Create a snapshot on a filesystem or volume. Defaultly create a snapshot on # filesystem # # $1 Existing filesystem or volume name. Default, $TESTPOOL/$TESTFS # $2 snapshot name. Default, $TESTSNAP # function create_snapshot { typeset fs_vol=${1:-$TESTPOOL/$TESTFS} typeset snap=${2:-$TESTSNAP} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." if snapexists $fs_vol@$snap; then log_fail "$fs_vol@$snap already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." log_must zfs snapshot $fs_vol@$snap } # # Create a clone from a snapshot, default clone name is $TESTCLONE. # # $1 Existing snapshot, $TESTPOOL/$TESTFS@$TESTSNAP is default. # $2 Clone name, $TESTPOOL/$TESTCLONE is default. # function create_clone # snapshot clone { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} typeset clone=${2:-$TESTPOOL/$TESTCLONE} [[ -z $snap ]] && \ log_fail "Snapshot name is undefined." [[ -z $clone ]] && \ log_fail "Clone name is undefined." log_must zfs clone $snap $clone } # # Create a bookmark of the given snapshot. Defaultly create a bookmark on # filesystem. # # $1 Existing filesystem or volume name. Default, $TESTFS # $2 Existing snapshot name. Default, $TESTSNAP # $3 bookmark name. Default, $TESTBKMARK # function create_bookmark { typeset fs_vol=${1:-$TESTFS} typeset snap=${2:-$TESTSNAP} typeset bkmark=${3:-$TESTBKMARK} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." [[ -z $bkmark ]] && log_fail "Bookmark's name is undefined." if bkmarkexists $fs_vol#$bkmark; then log_fail "$fs_vol#$bkmark already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." snapexists $fs_vol@$snap || \ log_fail "$fs_vol@$snap must exist." log_must zfs bookmark $fs_vol@$snap $fs_vol#$bkmark } # # Create a temporary clone result of an interrupted resumable 'zfs receive' # $1 Destination filesystem name. Must not exist, will be created as the result # of this function along with its %recv temporary clone # $2 Source filesystem name. Must not exist, will be created and destroyed # function create_recv_clone { typeset recvfs="$1" typeset sendfs="${2:-$TESTPOOL/create_recv_clone}" typeset snap="$sendfs@snap1" typeset incr="$sendfs@snap2" typeset mountpoint="$TESTDIR/create_recv_clone" typeset sendfile="$TESTDIR/create_recv_clone.zsnap" [[ -z $recvfs ]] && log_fail "Recv filesystem's name is undefined." datasetexists $recvfs && log_fail "Recv filesystem must not exist." datasetexists $sendfs && log_fail "Send filesystem must not exist." log_must zfs create -o compression=off -o mountpoint="$mountpoint" $sendfs log_must zfs snapshot $snap log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" if [[ $(get_prop 'inconsistent' "$recvfs/%recv") -ne 1 ]]; then log_fail "Error creating temporary $recvfs/%recv clone" fi } function default_mirror_setup { default_mirror_setup_noexit $1 $2 $3 log_pass } # # Given a pair of disks, set up a storage pool and dataset for the mirror # @parameters: $1 the primary side of the mirror # $2 the secondary side of the mirror # @uses: ZPOOL ZFS TESTPOOL TESTFS function default_mirror_setup_noexit { readonly func="default_mirror_setup_noexit" typeset primary=$1 typeset secondary=$2 [[ -z $primary ]] && \ log_fail "$func: No parameters passed" [[ -z $secondary ]] && \ log_fail "$func: No secondary partition passed" [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL mirror $@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Destroy the configured testpool mirrors. # the mirrors are of the form ${TESTPOOL}{number} # @uses: ZPOOL ZFS TESTPOOL function destroy_mirrors { default_cleanup_noexit log_pass } function default_raidz_setup { default_raidz_setup_noexit "$*" log_pass } # # Given a minimum of two disks, set up a storage pool and dataset for the raid-z # $1 the list of disks # function default_raidz_setup_noexit { typeset disklist="$*" disks=(${disklist[*]}) if [[ ${#disks[*]} -lt 2 ]]; then log_fail "A raid-z requires a minimum of two disks." fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL raidz $disklist log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Common function used to cleanup storage pools and datasets. # # Invoked at the start of the test suite to ensure the system # is in a known state, and also at the end of each set of # sub-tests to ensure errors from one set of tests doesn't # impact the execution of the next set. function default_cleanup { default_cleanup_noexit log_pass } # # Utility function used to list all available pool names. # # NOTE: $KEEP is a variable containing pool names, separated by a newline # character, that must be excluded from the returned list. # function get_all_pools { zpool list -H -o name | grep -Fvx "$KEEP" | grep -v "$NO_POOLS" } function default_cleanup_noexit { typeset pool="" # # Destroying the pool will also destroy any # filesystems it contains. # if is_global_zone; then zfs unmount -a > /dev/null 2>&1 ALL_POOLS=$(get_all_pools) # Here, we loop through the pools we're allowed to # destroy, only destroying them if it's safe to do # so. while [ ! -z ${ALL_POOLS} ] do for pool in ${ALL_POOLS} do if safe_to_destroy_pool $pool ; then destroy_pool $pool fi done ALL_POOLS=$(get_all_pools) done zfs mount -a else typeset fs="" for fs in $(zfs list -H -o name \ | grep "^$ZONE_POOL/$ZONE_CTR[01234]/"); do destroy_dataset "$fs" "-Rf" done # Need cleanup here to avoid garbage dir left. for fs in $(zfs list -H -o name); do [[ $fs == /$ZONE_POOL ]] && continue [[ -d $fs ]] && log_must rm -rf $fs/* done # # Reset the $ZONE_POOL/$ZONE_CTR[01234] file systems property to # the default value # for fs in $(zfs list -H -o name); do if [[ $fs == $ZONE_POOL/$ZONE_CTR[01234] ]]; then log_must zfs set reservation=none $fs log_must zfs set recordsize=128K $fs log_must zfs set mountpoint=/$fs $fs typeset enc=$(get_prop encryption $fs) if [ -z "$enc" ] || [ "$enc" = "off" ]; then log_must zfs set checksum=on $fs fi log_must zfs set compression=off $fs log_must zfs set atime=on $fs log_must zfs set devices=off $fs log_must zfs set exec=on $fs log_must zfs set setuid=on $fs log_must zfs set readonly=off $fs log_must zfs set snapdir=hidden $fs log_must zfs set aclmode=groupmask $fs log_must zfs set aclinherit=secure $fs fi done fi [[ -d $TESTDIR ]] && \ log_must rm -rf $TESTDIR disk1=${DISKS%% *} if is_mpath_device $disk1; then delete_partitions fi rm -f $TEST_BASE_DIR/{err,out} } # # Common function used to cleanup storage pools, file systems # and containers. # function default_container_cleanup { if ! is_global_zone; then reexport_pool fi ismounted $TESTPOOL/$TESTCTR/$TESTFS1 && log_must zfs unmount $TESTPOOL/$TESTCTR/$TESTFS1 destroy_dataset "$TESTPOOL/$TESTCTR/$TESTFS1" "-R" destroy_dataset "$TESTPOOL/$TESTCTR" "-Rf" [[ -e $TESTDIR1 ]] && \ log_must rm -rf $TESTDIR1 default_cleanup } # # Common function used to cleanup snapshot of file system or volume. Default to # delete the file system's snapshot # # $1 snapshot name # function destroy_snapshot { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if ! snapexists $snap; then log_fail "'$snap' does not exist." fi # # For the sake of the value which come from 'get_prop' is not equal # to the really mountpoint when the snapshot is unmounted. So, firstly # check and make sure this snapshot's been mounted in current system. # typeset mtpt="" if ismounted $snap; then mtpt=$(get_prop mountpoint $snap) fi destroy_dataset "$snap" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup clone. # # $1 clone name # function destroy_clone { typeset clone=${1:-$TESTPOOL/$TESTCLONE} if ! datasetexists $clone; then log_fail "'$clone' does not existed." fi # With the same reason in destroy_snapshot typeset mtpt="" if ismounted $clone; then mtpt=$(get_prop mountpoint $clone) fi destroy_dataset "$clone" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup bookmark of file system or volume. Default # to delete the file system's bookmark. # # $1 bookmark name # function destroy_bookmark { typeset bkmark=${1:-$TESTPOOL/$TESTFS#$TESTBKMARK} if ! bkmarkexists $bkmark; then log_fail "'$bkmarkp' does not existed." fi destroy_dataset "$bkmark" } # Return 0 if a snapshot exists; $? otherwise # # $1 - snapshot name function snapexists { zfs list -H -t snapshot "$1" > /dev/null 2>&1 } # # Return 0 if a bookmark exists; $? otherwise # # $1 - bookmark name # function bkmarkexists { zfs list -H -t bookmark "$1" > /dev/null 2>&1 } # # Return 0 if a hold exists; $? otherwise # # $1 - hold tag # $2 - snapshot name # function holdexists { ! zfs holds "$2" | awk -v t="$1" '$2 ~ t { exit 1 }' } # # Set a property to a certain value on a dataset. # Sets a property of the dataset to the value as passed in. # @param: # $1 dataset who's property is being set # $2 property to set # $3 value to set property to # @return: # 0 if the property could be set. # non-zero otherwise. # @use: ZFS # function dataset_setprop { typeset fn=dataset_setprop if (($# < 3)); then log_note "$fn: Insufficient parameters (need 3, had $#)" return 1 fi typeset output= output=$(zfs set $2=$3 $1 2>&1) typeset rv=$? if ((rv != 0)); then log_note "Setting property on $1 failed." log_note "property $2=$3" log_note "Return Code: $rv" log_note "Output: $output" return $rv fi return 0 } # # Check a numeric assertion # @parameter: $@ the assertion to check # @output: big loud notice if assertion failed # @use: log_fail # function assert { (($@)) || log_fail "$@" } # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions # to 0 size # function zero_partitions # { typeset diskname=$1 typeset i if is_freebsd; then gpart destroy -F $diskname elif is_linux; then DSK=$DEV_DSKDIR/$diskname DSK=$(echo $DSK | sed -e "s|//|/|g") log_must parted $DSK -s -- mklabel gpt blockdev --rereadpt $DSK 2>/dev/null block_device_wait else for i in 0 1 3 4 5 6 7 do log_must set_partition $i "" 0mb $diskname done fi return 0 } # # Given a slice, size and disk, this function # formats the slice to the specified size. # Size should be specified with units as per # the `format` command requirements eg. 100mb 3gb # # NOTE: This entire interface is problematic for the Linux parted utility # which requires the end of the partition to be specified. It would be # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # # arguments: function set_partition { typeset -i slicenum=$1 typeset start=$2 typeset size=$3 typeset disk=${4#$DEV_DSKDIR/} disk=${disk#$DEV_RDSKDIR/} case "$UNAME" in Linux) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk typeset size_mb=${size%%[mMgG]} size_mb=${size_mb%%[mMgG][bB]} if [[ ${size:1:1} == 'g' ]]; then ((size_mb = size_mb * 1024)) fi # Create GPT partition table when setting slice 0 or # when the device doesn't already contain a GPT label. parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then if ! parted $disk -s -- mklabel gpt; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi # When no start is given align on the first cylinder. if [[ -z "$start" ]]; then start=1 fi # Determine the cylinder size for the device and using # that calculate the end offset in cylinders. typeset -i cly_size_kb=0 cly_size_kb=$(parted -m $disk -s -- unit cyl print | awk -F '[:k.]' 'NR == 3 {print $4}') ((end = (size_mb * 1024 / cly_size_kb) + start)) parted $disk -s -- \ mkpart part$slicenum ${start}cyl ${end}cyl typeset ret_val=$? if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi blockdev --rereadpt $disk 2>/dev/null block_device_wait $disk ;; FreeBSD) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk if [[ $slicenum -eq 0 ]] || ! gpart show $disk >/dev/null 2>&1; then gpart destroy -F $disk >/dev/null 2>&1 if ! gpart create -s GPT $disk; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi typeset index=$((slicenum + 1)) if [[ -n $start ]]; then start="-b $start" fi gpart add -t freebsd-zfs $start -s $size -i $index $disk if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi block_device_wait $disk ;; *) if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." fi typeset format_file=/var/tmp/format_in.$$ echo "partition" >$format_file echo "$slicenum" >> $format_file echo "" >> $format_file echo "" >> $format_file echo "$start" >> $format_file echo "$size" >> $format_file echo "label" >> $format_file echo "" >> $format_file echo "q" >> $format_file echo "q" >> $format_file format -e -s -d $disk -f $format_file typeset ret_val=$? rm -f $format_file ;; esac if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 fi return 0 } # # Delete all partitions on all disks - this is specifically for the use of multipath # devices which currently can only be used in the test suite as raw/un-partitioned # devices (ie a zpool cannot be created on a whole mpath device that has partitions) # function delete_partitions { typeset disk if [[ -z $DISKSARRAY ]]; then DISKSARRAY=$DISKS fi if is_linux; then typeset -i part for disk in $DISKSARRAY; do for (( part = 1; part < MAX_PARTITIONS; part++ )); do typeset partition=${disk}${SLICE_PREFIX}${part} parted $DEV_DSKDIR/$disk -s rm $part > /dev/null 2>&1 if lsblk | grep -qF ${partition}; then log_fail "Partition ${partition} not deleted" else log_note "Partition ${partition} deleted" fi done done elif is_freebsd; then for disk in $DISKSARRAY; do if gpart destroy -F $disk; then log_note "Partitions for ${disk} deleted" else log_fail "Partitions for ${disk} not deleted" fi done fi } # # Get the end cyl of the given slice # function get_endslice # { typeset disk=$1 typeset slice=$2 if [[ -z $disk || -z $slice ]] ; then log_fail "The disk name or slice number is unspecified." fi case "$UNAME" in Linux) endcyl=$(parted -s $DEV_DSKDIR/$disk -- unit cyl print | \ awk "/part${slice}/"' {sub(/cyl/, "", $3); print $3}') ((endcyl = (endcyl + 1))) ;; FreeBSD) disk=${disk#/dev/zvol/} disk=${disk%p*} slice=$((slice + 1)) endcyl=$(gpart show $disk | \ awk -v slice=$slice '$3 == slice { print $1 + $2 }') ;; *) disk=${disk#/dev/dsk/} disk=${disk#/dev/rdsk/} disk=${disk%s*} typeset -i ratio=0 ratio=$(prtvtoc /dev/rdsk/${disk}s2 | \ awk '/sectors\/cylinder/ {print $2}') if ((ratio == 0)); then return fi typeset -i endcyl=$(prtvtoc -h /dev/rdsk/${disk}s2 | awk -v token="$slice" '$1 == token {print $6}') ((endcyl = (endcyl + 1) / ratio)) ;; esac echo $endcyl } # # Given a size,disk and total slice number, this function formats the # disk slices from 0 to the total slice number with the same specified # size. # function partition_disk # { typeset -i i=0 typeset slice_size=$1 typeset disk_name=$2 typeset total_slices=$3 typeset cyl zero_partitions $disk_name while ((i < $total_slices)); do if ! is_linux; then if ((i == 2)); then ((i = i + 1)) continue fi fi log_must set_partition $i "$cyl" $slice_size $disk_name cyl=$(get_endslice $disk_name $i) ((i = i+1)) done } # # This function continues to write to a filenum number of files into dirnum # number of directories until either file_write returns an error or the # maximum number of files per directory have been written. # # Usage: # fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] # # Return value: 0 on success # non 0 on error # # Where : # destdir: is the directory where everything is to be created under # dirnum: the maximum number of subdirectories to use, -1 no limit # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write # num_writes: number of types to write out bytes # data: the data that will be written # # E.g. # fill_fs /testdir 20 25 1024 256 0 # # Note: bytes * num_writes equals the size of the testfile # function fill_fs # destdir dirnum filenum bytes num_writes data { typeset destdir=${1:-$TESTDIR} typeset -i dirnum=${2:-50} typeset -i filenum=${3:-50} typeset -i bytes=${4:-8192} typeset -i num_writes=${5:-10240} typeset data=${6:-0} mkdir -p $destdir/{1..$dirnum} for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do file_write -o create -f $f -b $bytes -c $num_writes -d $data \ || return done } # Get the specified dataset property in parsable format or fail function get_prop # property dataset { typeset prop=$1 typeset dataset=$2 zfs get -Hpo value "$prop" "$dataset" || log_fail "zfs get $prop $dataset" } # Get the specified pool property in parsable format or fail function get_pool_prop # property pool { typeset prop=$1 typeset pool=$2 zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } # Return 0 if a pool exists; $? otherwise # # $1 - pool name function poolexists { typeset pool=$1 if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi zpool get name "$pool" > /dev/null 2>&1 } # Return 0 if all the specified datasets exist; $? otherwise # # $1-n dataset name function datasetexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi zfs get name "$@" > /dev/null 2>&1 } # return 0 if none of the specified datasets exists, otherwise return 1. # # $1-n dataset name function datasetnonexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi while (($# > 0)); do zfs list -H -t filesystem,snapshot,volume $1 > /dev/null 2>&1 \ && return 1 shift done return 0 } # FreeBSD breaks exports(5) at whitespace and doesn't process escapes # Solaris just breaks # # cf. https://github.com/openzfs/zfs/pull/13165#issuecomment-1059845807 # # Linux can have spaces (which are \OOO-escaped), # but can't have backslashes because they're parsed recursively function shares_can_have_whitespace { is_linux } function is_shared_freebsd { typeset fs=$1 pgrep -q mountd && showmount -E | grep -qx "$fs" } function is_shared_illumos { typeset fs=$1 typeset mtpt for mtpt in `share | awk '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done typeset stat=$(svcs -H -o STA nfs/server:default) if [[ $stat != "ON" ]]; then log_note "Current nfs/server status: $stat" fi return 1 } function is_shared_linux { typeset fs=$1 ! exportfs -s | awk -v fs="${fs//\\/\\\\}" '/^\// && $1 == fs {exit 1}' } # # Given a mountpoint, or a dataset name, determine if it is shared via NFS. # # Returns 0 if shared, 1 otherwise. # function is_shared { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case "$mtpt" in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_shared_freebsd "$fs" ;; Linux) is_shared_linux "$fs" ;; *) is_shared_illumos "$fs" ;; esac } function is_exported_illumos { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/dfs/sharetab return 1 } function is_exported_freebsd { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/zfs/exports return 1 } function is_exported_linux { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$(printf "$mtpt")" = "$fs" ] && return done < /etc/exports.d/zfs.exports return 1 } # # Given a mountpoint, or a dataset name, determine if it is exported via # the os-specific NFS exports file. # # Returns 0 if exported, 1 otherwise. # function is_exported { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case $mtpt in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_exported_freebsd "$fs" ;; Linux) is_exported_linux "$fs" ;; *) is_exported_illumos "$fs" ;; esac } # # Given a dataset name determine if it is shared via SMB. # # Returns 0 if shared, 1 otherwise. # function is_shared_smb { typeset fs=$1 datasetexists "$fs" || return if is_linux; then net usershare list | grep -xFq "${fs//[-\/]/_}" else log_note "SMB on $UNAME currently unsupported by the test framework" return 1 fi } # # Given a mountpoint, determine if it is not shared via NFS. # # Returns 0 if not shared, 1 otherwise. # function not_shared { ! is_shared $1 } # # Given a dataset determine if it is not shared via SMB. # # Returns 0 if not shared, 1 otherwise. # function not_shared_smb { ! is_shared_smb $1 } # # Helper function to unshare a mountpoint. # function unshare_fs #fs { typeset fs=$1 if is_shared $fs || is_shared_smb $fs; then log_must zfs unshare $fs fi } # # Helper function to share a NFS mountpoint. # function share_nfs #fs { typeset fs=$1 is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid log_must eval "printf '%s\t\n' \"$fs\" >> /etc/zfs/exports" log_must kill -s HUP "$mountd" ;; *) log_must share -F nfs "$fs" ;; esac return 0 } # # Helper function to unshare a NFS mountpoint. # function unshare_nfs #fs { typeset fs=$1 ! is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs -u "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid awk -v fs="${fs//\\/\\\\}" '$1 != fs' /etc/zfs/exports > /etc/zfs/exports.$$ log_must mv /etc/zfs/exports.$$ /etc/zfs/exports log_must kill -s HUP "$mountd" ;; *) log_must unshare -F nfs $fs ;; esac return 0 } # # Helper function to show NFS shares. # function showshares_nfs { case "$UNAME" in Linux) exportfs -v ;; FreeBSD) showmount ;; *) share -F nfs ;; esac } function check_nfs { case "$UNAME" in Linux) exportfs -s ;; FreeBSD) showmount -e ;; *) log_unsupported "Unknown platform" ;; esac || log_unsupported "The NFS utilities are not installed" } # # Check NFS server status and trigger it online. # function setup_nfs_server { # Cannot share directory in non-global zone. # if ! is_global_zone; then log_note "Cannot trigger NFS server by sharing in LZ." return fi if is_linux; then # # Re-synchronize /var/lib/nfs/etab with /etc/exports and # /etc/exports.d./* to provide a clean test environment. # log_must exportfs -r log_note "NFS server must be started prior to running ZTS." return elif is_freebsd; then log_must kill -s HUP $(/dev/null) [ $cur_zone = "global" ] fi } # # Verify whether test is permitted to run from # global zone, local zone, or both # # $1 zone limit, could be "global", "local", or "both"(no limit) # # Return 0 if permitted, otherwise exit with log_unsupported # function verify_runnable # zone limit { typeset limit=$1 [[ -z $limit ]] && return 0 if is_global_zone ; then case $limit in global|both) ;; local) log_unsupported "Test is unable to run from "\ "global zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac else case $limit in local|both) ;; global) log_unsupported "Test is unable to run from "\ "local zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac reexport_pool fi return 0 } # Return 0 if create successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # $2-n - [keyword] devs_list function create_pool #pool devs_list { typeset pool=${1%%/*} shift if [[ -z $pool ]]; then log_note "Missing pool name." return 1 fi if poolexists $pool ; then destroy_pool $pool fi if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool log_must zpool create -f $pool $@ fi return 0 } # Return 0 if destroy successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # Destroy pool with the given parameters. function destroy_pool #pool { typeset pool=${1%%/*} typeset mtpt if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi if is_global_zone ; then if poolexists "$pool" ; then mtpt=$(get_prop mountpoint "$pool") # At times, syseventd/udev activity can cause attempts # to destroy a pool to fail with EBUSY. We retry a few # times allowing failures before requiring the destroy # to succeed. log_must_busy zpool destroy -f $pool [[ -d $mtpt ]] && \ log_must rm -rf $mtpt else log_note "Pool does not exist. ($pool)" return 1 fi fi return 0 } # Return 0 if created successfully; $? otherwise # # $1 - dataset name # $2-n - dataset options function create_dataset #dataset dataset_options { typeset dataset=$1 shift if [[ -z $dataset ]]; then log_note "Missing dataset name." return 1 fi if datasetexists $dataset ; then destroy_dataset $dataset fi log_must zfs create $@ $dataset return 0 } # Return 0 if destroy successfully or the dataset exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - dataset name # $2 - custom arguments for zfs destroy # Destroy dataset with the given parameters. function destroy_dataset # dataset [args] { typeset dataset=$1 typeset mtpt typeset args=${2:-""} if [[ -z $dataset ]]; then log_note "No dataset name given." return 1 fi if is_global_zone ; then if datasetexists "$dataset" ; then mtpt=$(get_prop mountpoint "$dataset") log_must_busy zfs destroy $args $dataset [ -d $mtpt ] && log_must rm -rf $mtpt else log_note "Dataset does not exist. ($dataset)" return 1 fi fi return 0 } # # Reexport TESTPOOL & TESTPOOL(1-4) # function reexport_pool { typeset -i cntctr=5 typeset -i i=0 while ((i < cntctr)); do if ((i == 0)); then TESTPOOL=$ZONE_POOL/$ZONE_CTR$i if ! ismounted $TESTPOOL; then log_must zfs mount $TESTPOOL fi else eval TESTPOOL$i=$ZONE_POOL/$ZONE_CTR$i if eval ! ismounted \$TESTPOOL$i; then log_must eval zfs mount \$TESTPOOL$i fi fi ((i += 1)) done } # # Verify a given disk or pool state # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_state # pool disk state{online,offline,degraded} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 [[ -z $pool ]] || [[ -z $state ]] \ && log_fail "Arguments invalid or missing" if [[ -z $disk ]]; then #check pool state only zpool get -H -o value health $pool | grep -qi "$state" else zpool status -v $pool | grep "$disk" | grep -qi "$state" fi } # # Get the mountpoint of snapshot # For the snapshot use /.zfs/snapshot/ # as its mountpoint # function snapshot_mountpoint { typeset dataset=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if [[ $dataset != *@* ]]; then log_fail "Error name of snapshot '$dataset'." fi typeset fs=${dataset%@*} typeset snap=${dataset#*@} if [[ -z $fs || -z $snap ]]; then log_fail "Error name of snapshot '$dataset'." fi echo $(get_prop mountpoint $fs)/.zfs/snapshot/$snap } # # Given a device and 'ashift' value verify it's correctly set on every label # function verify_ashift # device ashift { typeset device="$1" typeset ashift="$2" zdb -e -lll $device | awk -v ashift=$ashift ' /ashift: / { if (ashift != $2) exit 1; else count++; } END { exit (count != 4); }' } # # Given a pool and file system, this function will verify the file system # using the zdb internal tool. Note that the pool is exported and imported # to ensure it has consistent state. # function verify_filesys # pool filesystem dir { typeset pool="$1" typeset filesys="$2" typeset zdbout="/tmp/zdbout.$$" shift shift typeset dirs=$@ typeset search_path="" log_note "Calling zdb to verify filesystem '$filesys'" zfs unmount -a > /dev/null 2>&1 log_must zpool export $pool if [[ -n $dirs ]] ; then for dir in $dirs ; do search_path="$search_path -d $dir" done fi log_must zpool import $search_path $pool if ! zdb -cudi $filesys > $zdbout 2>&1; then log_note "Output: zdb -cudi $filesys" cat $zdbout rm -f $zdbout log_fail "zdb detected errors with: '$filesys'" fi log_must zfs mount -a log_must rm -rf $zdbout } # # Given a pool issue a scrub and verify that no checksum errors are reported. # function verify_pool { typeset pool=${1:-$TESTPOOL} log_must zpool scrub $pool log_must wait_scrubbed $pool typeset -i cksum=$(zpool status $pool | awk ' !NF { isvdev = 0 } isvdev { errors += $NF } /CKSUM$/ { isvdev = 1 } END { print errors } ') if [[ $cksum != 0 ]]; then log_must zpool status -v log_fail "Unexpected CKSUM errors found on $pool ($cksum)" fi } # # Given a pool, and this function list all disks in the pool # function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") } # # Given a pool, and this function list all disks in the pool with their full # path (like "/dev/sda" instead of "sda"). # function get_disklist_fullpath # pool { get_disklist "-P $1" } # /** # This function kills a given list of processes after a time period. We use # this in the stress tests instead of STF_TIMEOUT so that we can have processes # run for a fixed amount of time, yet still pass. Tests that hit STF_TIMEOUT # would be listed as FAIL, which we don't want : we're happy with stress tests # running for a certain amount of time, then finishing. # # @param $1 the time in seconds after which we should terminate these processes # @param $2..$n the processes we wish to terminate. # */ function stress_timeout { typeset -i TIMEOUT=$1 shift typeset cpids="$@" log_note "Waiting for child processes($cpids). " \ "It could last dozens of minutes, please be patient ..." log_must sleep $TIMEOUT log_note "Killing child processes after ${TIMEOUT} stress timeout." typeset pid for pid in $cpids; do ps -p $pid > /dev/null 2>&1 && log_must kill -USR1 $pid done } # # Verify a given hotspare disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_hotspare_state # pool disk state{inuse,avail} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk "spares") [ $state = $cur_state ] } # # Wait until a hotspare transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_hotspare_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_hotspare_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Verify a given vdev disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_vdev_state # pool disk state{online,offline,unavail,removed} { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk) [ $state = $cur_state ] } # # Wait until a vdev transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_vdev_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_vdev_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. # # Return 0 is contain, 1 otherwise # function check_pool_status # pool token keyword { typeset pool=$1 typeset token=$2 typeset keyword=$3 typeset verbose=${4:-false} scan=$(zpool status -v "$pool" 2>/dev/null | awk -v token="$token:" '$1==token') if [[ $verbose == true ]]; then log_note $scan fi echo $scan | grep -qi "$keyword" } # # The following functions are instance of check_pool_status() # is_pool_resilvering - to check if the pool resilver is in progress # is_pool_resilvered - to check if the pool resilver is completed # is_pool_scrubbing - to check if the pool scrub is in progress # is_pool_scrubbed - to check if the pool scrub is completed # is_pool_scrub_stopped - to check if the pool scrub is stopped # is_pool_scrub_paused - to check if the pool scrub has paused # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded # is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 } function is_pool_resilvered #pool { check_pool_status "$1" "scan" "resilvered " $2 } function is_pool_scrubbing #pool { check_pool_status "$1" "scan" "scrub in progress since " $2 } function is_pool_error_scrubbing #pool { check_pool_status "$1" "scrub" "error scrub in progress since " $2 return $? } function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 } function is_pool_scrub_stopped #pool { check_pool_status "$1" "scan" "scrub canceled" $2 } function is_pool_error_scrub_stopped #pool { check_pool_status "$1" "scrub" "error scrub canceled on " $2 return $? } function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 } function is_pool_error_scrub_paused #pool { check_pool_status "$1" "scrub" "error scrub paused since " $2 return $? } function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " } function is_pool_removed #pool { check_pool_status "$1" "remove" "completed on" } function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } function is_pool_replacing #pool { zpool status "$1" | grep -qE 'replacing-[0-9]+' } function wait_for_degraded { typeset pool=$1 typeset timeout=${2:-30} typeset t0=$SECONDS while :; do [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break log_note "$pool is not yet degraded." sleep 1 if ((SECONDS - t0 > $timeout)); then log_note "$pool not degraded after $timeout seconds." return 1 fi done return 0 } # # Use create_pool()/destroy_pool() to clean up the information in # in the given disk to avoid slice overlapping. # function cleanup_devices #vdevs { typeset pool="foopool$$" for vdev in $@; do zero_partitions $vdev done poolexists $pool && destroy_pool $pool create_pool $pool $@ destroy_pool $pool return 0 } #/** # A function to find and locate free disks on a system or from given # disks as the parameter. It works by locating disks that are in use # as swap devices and dump devices, and also disks listed in /etc/vfstab # # $@ given disks to find which are free, default is all disks in # the test system # # @return a string containing the list of available disks #*/ function find_disks { # Trust provided list, no attempt is made to locate unused devices. if is_linux || is_freebsd; then echo "$@" return fi sfi=/tmp/swaplist.$$ dmpi=/tmp/dumpdev.$$ max_finddisksnum=${MAX_FINDDISKSNUM:-6} swap -l > $sfi dumpadm > $dmpi 2>/dev/null disks=${@:-$(echo "" | format -e 2>/dev/null | awk ' BEGIN { FS="."; } /^Specify disk/{ searchdisks=0; } { if (searchdisks && $2 !~ "^$"){ split($2,arr," "); print arr[1]; } } /^AVAILABLE DISK SELECTIONS:/{ searchdisks=1; } ')} unused="" for disk in $disks; do # Check for mounted grep -q "${disk}[sp]" /etc/mnttab && continue # Check for swap grep -q "${disk}[sp]" $sfi && continue # check for dump device grep -q "${disk}[sp]" $dmpi && continue # check to see if this disk hasn't been explicitly excluded # by a user-set environment variable echo "${ZFS_HOST_DEVICES_IGNORE}" | grep -q "${disk}" && continue unused_candidates="$unused_candidates $disk" done rm $sfi $dmpi # now just check to see if those disks do actually exist # by looking for a device pointing to the first slice in # each case. limit the number to max_finddisksnum count=0 for disk in $unused_candidates; do if is_disk_device $DEV_DSKDIR/${disk}s0 && \ [ $count -lt $max_finddisksnum ]; then unused="$unused $disk" # do not impose limit if $@ is provided [[ -z $@ ]] && ((count = count + 1)) fi done # finally, return our disk list echo $unused } function add_user_freebsd # { typeset group=$1 typeset user=$2 typeset basedir=$3 # Check to see if the user exists. if id $user > /dev/null 2>&1; then return 0 fi # Assign 1000 as the base uid typeset -i uid=1000 while true; do pw useradd -u $uid -g $group -d $basedir/$user -m -n $user case $? in 0) break ;; # The uid is not unique 65) ((uid += 1)) ;; *) return 1 ;; esac if [[ $uid == 65000 ]]; then log_fail "No user id available under 65000 for $user" fi done # Silence MOTD touch $basedir/$user/.hushlogin return 0 } # # Delete the specified user. # # $1 login name # function del_user_freebsd # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must pw userdel $user fi return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group_freebsd # { typeset group=$1 # See if the group already exists. if pw groupshow $group >/dev/null 2>&1; then return 0 fi # Assign 1000 as the base gid typeset -i gid=1000 while true; do pw groupadd -g $gid -n $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 65) ((gid += 1)) ;; *) return 1 ;; esac if [[ $gid == 65000 ]]; then log_fail "No user id available under 65000 for $group" fi done } # # Delete the specified group. # # $1 group name # function del_group_freebsd # { typeset group=$1 pw groupdel -n $group > /dev/null 2>&1 case $? in # Group does not exist, or was deleted successfully. 0|6|65) return 0 ;; # Name already exists as a group name 9) log_must pw groupdel $group ;; *) return 1 ;; esac return 0 } function add_user_illumos # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user return 0 } function del_user_illumos # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi return 0 } function add_group_illumos # { typeset group=$1 typeset -i gid=100 while true; do groupadd -g $gid $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 4) ((gid += 1)) ;; *) return 1 ;; esac done } function del_group_illumos # { typeset group=$1 groupmod -n $grp $grp > /dev/null 2>&1 case $? in # Group does not exist. 6) return 0 ;; # Name already exists as a group name 9) log_must groupdel $grp ;; *) return 1 ;; esac } function add_user_linux # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user # Add new users to the same group and the command line utils. # This allows them to be run out of the original users home # directory as long as it permissioned to be group readable. cmd_group=$(stat --format="%G" $(command -v zfs)) log_must usermod -a -G $cmd_group $user return 0 } function del_user_linux # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi } function add_group_linux # { typeset group=$1 # Assign 100 as the base gid, a larger value is selected for # Linux because for many distributions 1000 and under are reserved. while true; do groupadd $group > /dev/null 2>&1 case $? in 0) return 0 ;; *) return 1 ;; esac done } function del_group_linux # { typeset group=$1 getent group $group > /dev/null 2>&1 case $? in # Group does not exist. 2) return 0 ;; # Name already exists as a group name 0) log_must groupdel $group ;; *) return 1 ;; esac return 0 } # # Add specified user to specified group # # $1 group name # $2 user name # $3 base of the homedir (optional) # function add_user # { typeset group=$1 typeset user=$2 typeset basedir=${3:-"/var/tmp"} if ((${#group} == 0 || ${#user} == 0)); then log_fail "group name or user name are not defined." fi case "$UNAME" in FreeBSD) add_user_freebsd "$group" "$user" "$basedir" ;; Linux) add_user_linux "$group" "$user" "$basedir" ;; *) add_user_illumos "$group" "$user" "$basedir" ;; esac return 0 } # # Delete the specified user. # # $1 login name # $2 base of the homedir (optional) # function del_user # { typeset user=$1 typeset basedir=${2:-"/var/tmp"} if ((${#user} == 0)); then log_fail "login name is necessary." fi case "$UNAME" in FreeBSD) del_user_freebsd "$user" ;; Linux) del_user_linux "$user" ;; *) del_user_illumos "$user" ;; esac [[ -d $basedir/$user ]] && rm -fr $basedir/$user return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) add_group_freebsd "$group" ;; Linux) add_group_linux "$group" ;; *) add_group_illumos "$group" ;; esac return 0 } # # Delete the specified group. # # $1 group name # function del_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) del_group_freebsd "$group" ;; Linux) del_group_linux "$group" ;; *) del_group_illumos "$group" ;; esac return 0 } # # This function will return true if it's safe to destroy the pool passed # as argument 1. It checks for pools based on zvols and files, and also # files contained in a pool that may have a different mountpoint. # function safe_to_destroy_pool { # $1 the pool name typeset pool="" typeset DONT_DESTROY="" # We check that by deleting the $1 pool, we're not # going to pull the rug out from other pools. Do this # by looking at all other pools, ensuring that they # aren't built from files or zvols contained in this pool. for pool in $(zpool list -H -o name) do ALTMOUNTPOOL="" # this is a list of the top-level directories in each of the # files that make up the path to the files the pool is based on FILEPOOL=$(zpool status -v $pool | awk -v pool="/$1/" '$0 ~ pool {print $1}') # this is a list of the zvols that make up the pool ZVOLPOOL=$(zpool status -v $pool | awk -v zvols="$ZVOL_DEVDIR/$1$" '$0 ~ zvols {print $1}') # also want to determine if it's a file-based pool using an # alternate mountpoint... POOL_FILE_DIRS=$(zpool status -v $pool | \ awk '/\// {print $1}' | \ awk -F/ '!/dev/ {print $2}') for pooldir in $POOL_FILE_DIRS do OUTPUT=$(zfs list -H -r -o mountpoint $1 | \ awk -v pd="${pooldir}$" '$0 ~ pd {print $1}') ALTMOUNTPOOL="${ALTMOUNTPOOL}${OUTPUT}" done if [ ! -z "$ZVOLPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ZVOLPOOL on $1" fi if [ ! -z "$FILEPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $FILEPOOL on $1" fi if [ ! -z "$ALTMOUNTPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ALTMOUNTPOOL on $1" fi done if [ -z "${DONT_DESTROY}" ] then return 0 else log_note "Warning: it is not safe to destroy $1!" return 1 fi } # # Verify zfs operation with -p option work as expected # $1 operation, value could be create, clone or rename # $2 dataset type, value could be fs or vol # $3 dataset name # $4 new dataset name # function verify_opt_p_ops { typeset ops=$1 typeset datatype=$2 typeset dataset=$3 typeset newdataset=$4 if [[ $datatype != "fs" && $datatype != "vol" ]]; then log_fail "$datatype is not supported." fi # check parameters accordingly case $ops in create) newdataset=$dataset dataset="" if [[ $datatype == "vol" ]]; then ops="create -V $VOLSIZE" fi ;; clone) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset log_must snapexists $dataset ;; rename) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset ;; *) log_fail "$ops is not supported." ;; esac # make sure the upper level filesystem does not exist destroy_dataset "${newdataset%/*}" "-rRf" # without -p option, operation will fail log_mustnot zfs $ops $dataset $newdataset log_mustnot datasetexists $newdataset ${newdataset%/*} # with -p option, operation should succeed log_must zfs $ops -p $dataset $newdataset block_device_wait if ! datasetexists $newdataset ; then log_fail "-p option does not work for $ops" fi # when $ops is create or clone, redo the operation still return zero if [[ $ops != "rename" ]]; then log_must zfs $ops -p $dataset $newdataset fi return 0 } # # Get configuration of pool # $1 pool name # $2 config name # function get_config { typeset pool=$1 typeset config=$2 if ! poolexists "$pool" ; then return 1 fi if [ "$(get_pool_prop cachefile "$pool")" = "none" ]; then zdb -e $pool else zdb -C $pool fi | awk -F: -v cfg="$config:" '$0 ~ cfg {sub(/^'\''/, $2); sub(/'\''$/, $2); print $2}' } # # Privated function. Random select one of items from arguments. # # $1 count # $2-n string # function _random_get { typeset cnt=$1 shift typeset str="$@" typeset -i ind ((ind = RANDOM % cnt + 1)) echo "$str" | cut -f $ind -d ' ' } # # Random select one of item from arguments which include NONE string # function random_get_with_non { typeset -i cnt=$# ((cnt =+ 1)) _random_get "$cnt" "$@" } # # Random select one of item from arguments which doesn't include NONE string # function random_get { _random_get "$#" "$@" } # # The function will generate a dataset name with specific length # $1, the length of the name # $2, the base string to construct the name # function gen_dataset_name { typeset -i len=$1 typeset basestr="$2" typeset -i baselen=${#basestr} typeset -i iter=0 typeset l_name="" if ((len % baselen == 0)); then ((iter = len / baselen)) else ((iter = len / baselen + 1)) fi while ((iter > 0)); do l_name="${l_name}$basestr" ((iter -= 1)) done echo $l_name } # # Get cksum tuple of dataset # $1 dataset name # # sample zdb output: # Dataset data/test [ZPL], ID 355, cr_txg 2413856, 31.0K, 7 objects, rootbp # DVA[0]=<0:803046400:200> DVA[1]=<0:81199000:200> [L0 DMU objset] fletcher4 # lzjb LE contiguous unique double size=800L/200P birth=2413856L/2413856P # fill=7 cksum=11ce125712:643a9c18ee2:125e25238fca0:254a3f74b59744 function datasetcksum { typeset cksum sync sync_all_pools zdb -vvv $1 | awk -F= -v ds="^Dataset $1 "'\\[' '$0 ~ ds && /cksum/ {print $7}' } # # Get the given disk/slice state from the specific field of the pool # function get_device_state #pool disk field("", "spares","logs") { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset field=${3:-$pool} zpool status -v "$pool" 2>/dev/null | \ awk -v device=$disk -v pool=$pool -v field=$field \ 'BEGIN {startconfig=0; startfield=0; } /config:/ {startconfig=1} (startconfig==1) && ($1==field) {startfield=1; next;} (startfield==1) && ($1==device) {print $2; exit;} (startfield==1) && ($1==field || $1 ~ "^spares$" || $1 ~ "^logs$") {startfield=0}' } # # get the root filesystem name if it's zfsroot system. # # return: root filesystem name function get_rootfs { typeset rootfs="" if is_freebsd; then rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') elif ! is_linux; then rootfs=$(awk '$2 == "/" && $3 == "zfs" {print $1}' \ /etc/mnttab) fi if [[ -z "$rootfs" ]]; then log_fail "Can not get rootfs" fi if datasetexists $rootfs; then echo $rootfs else log_fail "This is not a zfsroot system." fi } # # get the rootfs's pool name # return: # rootpool name # function get_rootpool { typeset rootfs=$(get_rootfs) echo ${rootfs%%/*} } # # To verify if the require numbers of disks is given # function verify_disk_count { typeset -i min=${2:-1} typeset -i count=$(echo "$1" | wc -w) if ((count < min)); then log_untested "A minimum of $min disks is required to run." \ " You specified $count disk(s)" fi } function ds_is_volume { typeset type=$(get_prop type $1) [ $type = "volume" ] } function ds_is_filesystem { typeset type=$(get_prop type $1) [ $type = "filesystem" ] } # # Check if Trusted Extensions are installed and enabled # function is_te_enabled { svcs -H -o state labeld 2>/dev/null | grep -q "enabled" } # Return the number of CPUs (cross-platform) function get_num_cpus { if is_linux ; then grep -c '^processor' /proc/cpuinfo elif is_freebsd; then sysctl -n kern.smp.cpus else psrinfo | wc -l fi } # Utility function to determine if a system has multiple cpus. function is_mp { [[ $(get_num_cpus) -gt 1 ]] } function get_cpu_freq { if is_linux; then lscpu | awk '/CPU MHz/ { print $3 }' elif is_freebsd; then sysctl -n hw.clockrate else psrinfo -v 0 | awk '/processor operates at/ {print $6}' fi } # Run the given command as the user provided. function user_run { typeset user=$1 shift log_note "user: $user" log_note "cmd: $*" typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err typeset res=$? log_note "out: $(<$out)" log_note "err: $(<$err)" return $res } # # Check if the pool contains the specified vdevs # # $1 pool # $2..n ... # # Return 0 if the vdevs are contained in the pool, 1 if any of the specified # vdevs is not in the pool, and 2 if pool name is missing. # function vdevs_in_pool { typeset pool=$1 typeset vdev if [[ -z $pool ]]; then log_note "Missing pool name." return 2 fi shift # We could use 'zpool list' to only get the vdevs of the pool but we # can't reference a mirror/raidz vdev using its ID (i.e mirror-0), # therefore we use the 'zpool status' output. typeset tmpfile=$(mktemp) zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile for vdev in "$@"; do grep -wq ${vdev##*/} $tmpfile || return 1 done rm -f $tmpfile return 0 } function get_max { typeset -l i max=$1 shift for i in "$@"; do max=$((max > i ? max : i)) done echo $max } # Write data that can be compressed into a directory function write_compressible { typeset dir=$1 typeset megs=$2 typeset nfiles=${3:-1} typeset bs=${4:-1024k} typeset fname=${5:-file} [[ -d $dir ]] || log_fail "No directory: $dir" # Under Linux fio is not currently used since its behavior can # differ significantly across versions. This includes missing # command line options and cases where the --buffer_compress_* # options fail to behave as expected. if is_linux; then typeset file_bytes=$(to_bytes $megs) typeset bs_bytes=4096 typeset blocks=$(($file_bytes / $bs_bytes)) for (( i = 0; i < $nfiles; i++ )); do truncate -s $file_bytes $dir/$fname.$i # Write every third block to get 66% compression. for (( j = 0; j < $blocks; j += 3 )); do dd if=/dev/urandom of=$dir/$fname.$i \ seek=$j bs=$bs_bytes count=1 \ conv=notrunc >/dev/null 2>&1 done done else command -v fio > /dev/null || log_unsupported "fio missing" log_must eval fio \ --name=job \ --fallocate=0 \ --minimal \ --randrepeat=0 \ --buffer_compress_percentage=66 \ --buffer_compress_chunk=4096 \ --directory="$dir" \ --numjobs="$nfiles" \ --nrfiles="$nfiles" \ --rw=write \ --bs="$bs" \ --filesize="$megs" \ "--filename_format='$fname.\$jobnum' >/dev/null" fi } function get_objnum { typeset pathname=$1 typeset objnum [[ -e $pathname ]] || log_fail "No such file or directory: $pathname" if is_freebsd; then objnum=$(stat -f "%i" $pathname) else objnum=$(stat -c %i $pathname) fi echo $objnum } # # Sync data to the pool # # $1 pool name # $2 boolean to force uberblock (and config including zpool cache file) update # function sync_pool #pool { typeset pool=${1:-$TESTPOOL} typeset force=${2:-false} if [[ $force == true ]]; then log_must zpool sync -f $pool else log_must zpool sync $pool fi return 0 } # # Sync all pools # # $1 boolean to force uberblock (and config including zpool cache file) update # function sync_all_pools # { typeset force=${1:-false} if [[ $force == true ]]; then log_must zpool sync -f else log_must zpool sync fi return 0 } # # Wait for zpool 'freeing' property drops to zero. # # $1 pool name # function wait_freeing #pool { typeset pool=${1:-$TESTPOOL} while true; do [[ "0" == "$(zpool list -Ho freeing $pool)" ]] && break log_must sleep 1 done } # # Wait for every device replace operation to complete # # $1 pool name # $2 timeout # function wait_replacing #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_replacing $pool || break; sleep 1; done } # Wait for a pool to be scrubbed # # $1 pool name # $2 timeout # function wait_scrubbed #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_scrubbed $pool && break; sleep 1; done } # Backup the zed.rc in our test directory so that we can edit it for our test. # # Returns: Backup file name. You will need to pass this to zed_rc_restore(). function zed_rc_backup { zedrc_backup="$(mktemp)" cp $ZEDLET_DIR/zed.rc $zedrc_backup echo $zedrc_backup } function zed_rc_restore { mv $1 $ZEDLET_DIR/zed.rc } # # Setup custom environment for the ZED. # # $@ Optional list of zedlets to run under zed. function zed_setup { if ! is_linux; then log_unsupported "No zed on $UNAME" fi if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi if [[ ! -e $VDEVID_CONF ]]; then log_must touch $VDEVID_CONF fi if [[ -e $VDEVID_CONF_ETC ]]; then log_fail "Must not have $VDEVID_CONF_ETC file present on system" fi EXTRA_ZEDLETS=$@ # Create a symlink for /etc/zfs/vdev_id.conf file. log_must ln -s $VDEVID_CONF $VDEVID_CONF_ETC # Setup minimal ZED configuration. Individual test cases should # add additional ZEDLETs as needed for their specific test. log_must cp ${ZEDLET_ETC_DIR}/zed.rc $ZEDLET_DIR log_must cp ${ZEDLET_ETC_DIR}/zed-functions.sh $ZEDLET_DIR # Scripts must only be user writable. if [[ -n "$EXTRA_ZEDLETS" ]] ; then saved_umask=$(umask) log_must umask 0022 for i in $EXTRA_ZEDLETS ; do log_must cp ${ZEDLET_LIBEXEC_DIR}/$i $ZEDLET_DIR done log_must umask $saved_umask fi # Customize the zed.rc file to enable the full debug log. log_must sed -i '/\#ZED_DEBUG_LOG=.*/d' $ZEDLET_DIR/zed.rc echo "ZED_DEBUG_LOG=$ZED_DEBUG_LOG" >>$ZEDLET_DIR/zed.rc } # # Cleanup custom ZED environment. # # $@ Optional list of zedlets to remove from our test zed.d directory. function zed_cleanup { if ! is_linux; then return fi for extra_zedlet; do log_must rm -f ${ZEDLET_DIR}/$extra_zedlet done log_must rm -fd ${ZEDLET_DIR}/zed.rc ${ZEDLET_DIR}/zed-functions.sh ${ZEDLET_DIR}/all-syslog.sh ${ZEDLET_DIR}/all-debug.sh ${ZEDLET_DIR}/state \ $ZED_LOG $ZED_DEBUG_LOG $VDEVID_CONF_ETC $VDEVID_CONF \ $ZEDLET_DIR } # # Check if ZED is currently running; if so, returns PIDs # function zed_check { if ! is_linux; then return fi zedpids="$(pgrep -x zed)" zedpids2="$(pgrep -x lt-zed)" echo ${zedpids} ${zedpids2} } # # Check if ZED is currently running, if not start ZED. # function zed_start { if ! is_linux; then return fi # ZEDLET_DIR=/var/tmp/zed if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi # Verify the ZED is not already running. zedpids=$(zed_check) if [ -n "$zedpids" ]; then # We never, ever, really want it to just keep going if zed # is already running - usually this implies our test cases # will break very strangely because whatever we wanted to # configure zed for won't be listening to our changes in the # tmpdir log_fail "ZED already running - ${zedpids}" else log_note "Starting ZED" # run ZED in the background and redirect foreground logging # output to $ZED_LOG. log_must truncate -s 0 $ZED_DEBUG_LOG log_must eval "zed -vF -d $ZEDLET_DIR -P $PATH" \ "-s $ZEDLET_DIR/state -j 1 2>$ZED_LOG &" fi return 0 } # # Kill ZED process # function zed_stop { if ! is_linux; then return "" fi log_note "Stopping ZED" while true; do zedpids=$(zed_check) [ ! -n "$zedpids" ] && break log_must kill $zedpids sleep 1 done return 0 } # # Drain all zevents # function zed_events_drain { while [ $(zpool events -H | wc -l) -ne 0 ]; do sleep 1 zpool events -c >/dev/null done } # Set a variable in zed.rc to something, un-commenting it in the process. # # $1 variable # $2 value function zed_rc_set { var="$1" val="$2" # Remove the line cmd="'/$var/d'" eval sed -i $cmd $ZEDLET_DIR/zed.rc # Add it at the end echo "$var=$val" >> $ZEDLET_DIR/zed.rc } # # Check is provided device is being active used as a swap device. # function is_swap_inuse { typeset device=$1 if [[ -z $device ]] ; then log_note "No device specified." return 1 fi case "$UNAME" in Linux) swapon -s | grep -wq $(readlink -f $device) ;; FreeBSD) swapctl -l | grep -wq $device ;; *) swap -l | grep -wq $device ;; esac } # # Setup a swap device using the provided device. # function swap_setup { typeset swapdev=$1 case "$UNAME" in Linux) log_must eval "mkswap $swapdev > /dev/null 2>&1" log_must swapon $swapdev ;; FreeBSD) log_must swapctl -a $swapdev ;; *) log_must swap -a $swapdev ;; esac return 0 } # # Cleanup a swap device on the provided device. # function swap_cleanup { typeset swapdev=$1 if is_swap_inuse $swapdev; then if is_linux; then log_must swapoff $swapdev elif is_freebsd; then log_must swapoff $swapdev else log_must swap -d $swapdev fi fi return 0 } # # Set a global system tunable (64-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable64 { set_tunable_impl "$1" "$2" Z } # # Set a global system tunable (32-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable32 { set_tunable_impl "$1" "$2" W } function set_tunable_impl { typeset name="$1" typeset value="$2" typeset mdb_cmd="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) log_unsupported "Tunable '$name' is unsupported on $UNAME" ;; "") log_fail "Tunable '$name' must be added to tunables.cfg" ;; *) ;; esac [[ -z "$value" ]] && return 1 [[ -z "$mdb_cmd" ]] && return 1 case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/zfs/parameters" echo "$value" >"$zfs_tunables/$tunable" ;; FreeBSD) sysctl vfs.zfs.$tunable=$value ;; SunOS) echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw ;; esac } +function save_tunable +{ + [[ ! -d $TEST_BASE_DIR ]] && return 1 + [[ -e $TEST_BASE_DIR/tunable-$1 ]] && return 2 + echo "$(get_tunable """$1""")" > "$TEST_BASE_DIR"/tunable-"$1" +} + +function restore_tunable +{ + [[ ! -e $TEST_BASE_DIR/tunable-$1 ]] && return 1 + val="$(cat $TEST_BASE_DIR/tunable-"""$1""")" + set_tunable64 "$1" "$val" + rm $TEST_BASE_DIR/tunable-$1 +} + # # Get a global system tunable # # $1 tunable name (use a NAME defined in tunables.cfg) # function get_tunable { get_tunable_impl "$1" } function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" typeset check_only="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) if [ -z "$check_only" ] ; then log_unsupported "Tunable '$name' is unsupported on $UNAME" else return 1 fi ;; "") if [ -z "$check_only" ] ; then log_fail "Tunable '$name' must be added to tunables.cfg" else return 1 fi ;; *) ;; esac case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" cat $zfs_tunables/$tunable ;; FreeBSD) sysctl -n vfs.zfs.$tunable ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 ;; esac } # Does a tunable exist? # # $1: Tunable name function tunable_exists { get_tunable_impl $1 "zfs" 1 } # # Compute MD5 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function md5digest { typeset file=$1 case "$UNAME" in FreeBSD) md5 -q $file ;; *) typeset sum _ read -r sum _ < <(md5sum -b $file) echo $sum ;; esac } # # Compute SHA256 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function sha256digest { typeset file=$1 case "$UNAME" in FreeBSD) sha256 -q $file ;; *) typeset sum _ read -r sum _ < <(sha256sum -b $file) echo $sum ;; esac } function new_fs # { case "$UNAME" in FreeBSD) newfs "$@" ;; *) echo y | newfs -v "$@" ;; esac } function stat_size # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %z "$path" ;; *) stat -c %s "$path" ;; esac } function stat_mtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %m "$path" ;; *) stat -c %Y "$path" ;; esac } function stat_ctime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %c "$path" ;; *) stat -c %Z "$path" ;; esac } function stat_crtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %B "$path" ;; *) stat -c %W "$path" ;; esac } function stat_generation # { typeset path=$1 case "$UNAME" in Linux) getversion "${path}" ;; *) stat -f %v "${path}" ;; esac } # Run a command as if it was being run in a TTY. # # Usage: # # faketty command # function faketty { if is_freebsd; then script -q /dev/null env "$@" else script --return --quiet -c "$*" /dev/null fi } # # Produce a random permutation of the integers in a given range (inclusive). # function range_shuffle # begin end { typeset -i begin=$1 typeset -i end=$2 seq ${begin} ${end} | sort -R } # # Cross-platform xattr helpers # function get_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) getextattr -qq user "${name}" "${path}" ;; *) attr -qg "${name}" "${path}" ;; esac } function set_xattr # name value path { typeset name=$1 typeset value=$2 typeset path=$3 case "$UNAME" in FreeBSD) setextattr user "${name}" "${value}" "${path}" ;; *) attr -qs "${name}" -V "${value}" "${path}" ;; esac } function set_xattr_stdin # name value { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) setextattr -i user "${name}" "${path}" ;; *) attr -qs "${name}" "${path}" ;; esac } function rm_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) rmextattr -q user "${name}" "${path}" ;; *) attr -qr "${name}" "${path}" ;; esac } function ls_xattr # path { typeset path=$1 case "$UNAME" in FreeBSD) lsextattr -qq user "${path}" ;; *) attr -ql "${path}" ;; esac } function kstat # stat flags? { typeset stat=$1 typeset flags=${2-"-n"} case "$UNAME" in FreeBSD) sysctl $flags kstat.zfs.misc.$stat ;; Linux) cat "/proc/spl/kstat/zfs/$stat" 2>/dev/null ;; *) false ;; esac } function get_arcstat # stat { typeset stat=$1 case "$UNAME" in FreeBSD) kstat arcstats.$stat ;; Linux) kstat arcstats | awk "/$stat/"' { print $3 }' ;; *) false ;; esac } function punch_hole # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in FreeBSD) truncate -d -o $offset -l $length "$file" ;; Linux) fallocate --punch-hole --offset $offset --length $length "$file" ;; *) false ;; esac } function zero_range # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in Linux) fallocate --zero-range --offset $offset --length $length "$file" ;; *) false ;; esac } # # Wait for the specified arcstat to reach non-zero quiescence. # If echo is 1 echo the value after reaching quiescence, otherwise # if echo is 0 print the arcstat we are waiting on. # function arcstat_quiescence # stat echo { typeset stat=$1 typeset echo=$2 typeset do_once=true if [[ $echo -eq 0 ]]; then echo "Waiting for arcstat $1 quiescence." fi while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do typeset stat1=$(get_arcstat $stat) sleep 0.5 typeset stat2=$(get_arcstat $stat) do_once=false done if [[ $echo -eq 1 ]]; then echo $stat2 fi } function arcstat_quiescence_noecho # stat { typeset stat=$1 arcstat_quiescence $stat 0 } function arcstat_quiescence_echo # stat { typeset stat=$1 arcstat_quiescence $stat 1 } # # Given an array of pids, wait until all processes # have completed and check their return status. # function wait_for_children #children { rv=0 children=("$@") for child in "${children[@]}" do child_exit=0 wait ${child} || child_exit=$? if [ $child_exit -ne 0 ]; then echo "child ${child} failed with ${child_exit}" rv=1 fi done return $rv } # # Compare two directory trees recursively in a manner similar to diff(1), but # using rsync. If there are any discrepancies, a summary of the differences are # output and a non-zero error is returned. # # If you're comparing a directory after a ZIL replay, you should set # LIBTEST_DIFF_ZIL_REPLAY=1 or use replay_directory_diff which will cause # directory_diff to ignore mtime changes (the ZIL replay won't fix up mtime # information). # function directory_diff # dir_a dir_b { dir_a="$1" dir_b="$2" zil_replay="${LIBTEST_DIFF_ZIL_REPLAY:-0}" # If one of the directories doesn't exist, return 2. This is to match the # semantics of diff. if ! [ -d "$dir_a" -a -d "$dir_b" ]; then return 2 fi # Run rsync with --dry-run --itemize-changes to get something akin to diff # output, but rsync is far more thorough in detecting differences (diff # doesn't compare file metadata, and cannot handle special files). # # Also make sure to filter out non-user.* xattrs when comparing. On # SELinux-enabled systems the copied tree will probably have different # SELinux labels. args=("-nicaAHX" '--filter=-x! user.*' "--delete") # NOTE: Quite a few rsync builds do not support --crtimes which would be # necessary to verify that creation times are being maintained properly. # Unfortunately because of this we cannot use it unconditionally but we can # check if this rsync build supports it and use it then. This check is # based on the same check in the rsync test suite (testsuite/crtimes.test). # # We check ctimes even with zil_replay=1 because the ZIL does store # creation times and we should make sure they match (if the creation times # do not match there is a "c" entry in one of the columns). if rsync --version | grep -q "[, ] crtimes"; then args+=("--crtimes") else log_note "This rsync package does not support --crtimes (-N)." fi # If we are testing a ZIL replay, we need to ignore timestamp changes. # Unfortunately --no-times doesn't do what we want -- it will still tell # you if the timestamps don't match but rsync will set the timestamps to # the current time (leading to an itemised change entry). It's simpler to # just filter out those lines. if [ "$zil_replay" -eq 0 ]; then filter=("cat") else # Different rsync versions have different numbers of columns. So just # require that aside from the first two, all other columns must be # blank (literal ".") or a timestamp field ("[tT]"). filter=("grep" "-v" '^\..[.Tt]\+ ') fi diff="$(rsync "${args[@]}" "$dir_a/" "$dir_b/" | "${filter[@]}")" rv=0 if [ -n "$diff" ]; then echo "$diff" rv=1 fi return $rv } # # Compare two directory trees recursively, without checking whether the mtimes # match (creation times will be checked if the available rsync binary supports # it). This is necessary for ZIL replay checks (because the ZIL does not # contain mtimes and thus after a ZIL replay, mtimes won't match). # # This is shorthand for LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff <...>. # function replay_directory_diff # dir_a dir_b { LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff "$@" } # # Put coredumps into $1/core.{basename} # # Output must be saved and passed to pop_coredump_pattern on cleanup # function push_coredump_pattern # dir { ulimit -c unlimited case "$UNAME" in Linux) cat /proc/sys/kernel/core_pattern /proc/sys/kernel/core_uses_pid echo "$1/core.%e" >/proc/sys/kernel/core_pattern && echo 0 >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl -n kern.corefile sysctl kern.corefile="$1/core.%N" >/dev/null ;; *) # Nothing to output – set only for this shell coreadm -p "$1/core.%f" ;; esac } # # Put coredumps back into the default location # function pop_coredump_pattern { [ -s "$1" ] || return 0 case "$UNAME" in Linux) typeset pat pid { read -r pat; read -r pid; } < "$1" echo "$pat" >/proc/sys/kernel/core_pattern && echo "$pid" >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl kern.corefile="$(<"$1")" >/dev/null ;; esac } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 8010a9451597..a0edad14d028 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -1,101 +1,102 @@ # This file exports variables for each tunable used in the test suite. # # Different platforms use different names for most tunables. To avoid littering # the tests with conditional logic for deciding how to set each tunable, the # logic is instead consolidated to this one file. # # Any use of tunables in tests must use a name defined here. New entries # should be added to the table as needed. Please keep the table sorted # alphabetically for ease of maintenance. # # Platform-specific tunables should still use a NAME from this table for # consistency. Enter UNSUPPORTED in the column for platforms on which the # tunable is not implemented. UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks CHECKSUM_EVENTS_PER_SECOND checksum_events_per_second zfs_checksum_events_per_second COMMIT_TIMEOUT_PCT commit_timeout_pct zfs_commit_timeout_pct COMPRESSED_ARC_ENABLED compressed_arc_enabled zfs_compressed_arc_enabled CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms zfs_condense_indirect_commit_entry_delay_ms CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause LIVELIST_CONDENSE_ZTHR_CANCEL livelist.condense.zthr_cancel zfs_livelist_condense_zthr_cancel LIVELIST_CONDENSE_ZTHR_PAUSE livelist.condense.zthr_pause zfs_livelist_condense_zthr_pause LIVELIST_MAX_ENTRIES livelist.max_entries zfs_livelist_max_entries LIVELIST_MIN_PERCENT_SHARED livelist.min_percent_shared zfs_livelist_min_percent_shared MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED -VOL_USE_BLK_MQ UNSUPPORTED UNSUPPORTED +VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq +BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms ZIL_SAXATTR zil_saxattr zfs_zil_saxattr %%%% while read name FreeBSD Linux; do eval "export ${name}=\$${UNAME}" done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 158401e078aa..87b50f59ca7a 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -1,2073 +1,2074 @@ CLEANFILES = dist_noinst_DATA = include $(top_srcdir)/config/Substfiles.am datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests nobase_dist_datadir_zfs_tests_tests_DATA = \ perf/nfs-sample.cfg \ perf/perf.shlib \ \ perf/fio/mkfiles.fio \ perf/fio/random_reads.fio \ perf/fio/random_readwrite.fio \ perf/fio/random_readwrite_fixed.fio \ perf/fio/random_writes.fio \ perf/fio/sequential_reads.fio \ perf/fio/sequential_readwrite.fio \ perf/fio/sequential_writes.fio nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \ perf/regression/random_reads.ksh \ perf/regression/random_readwrite.ksh \ perf/regression/random_readwrite_fixed.ksh \ perf/regression/random_writes.ksh \ perf/regression/random_writes_zil.ksh \ perf/regression/sequential_reads_arc_cached_clone.ksh \ perf/regression/sequential_reads_arc_cached.ksh \ perf/regression/sequential_reads_dbuf_cached.ksh \ perf/regression/sequential_reads.ksh \ perf/regression/sequential_writes.ksh \ perf/regression/setup.ksh \ \ perf/scripts/prefetch_io.sh # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source: # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # # simd and tmpfile are Linux-only and not installed elsewhere # # C programs are specced in ../Makefile.am above as part of the main Makefile find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' regen: @$(MAKE) -C $(top_builddir) clean @$(MAKE) clean $(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am echo >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am $(find_common) ! -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am $(find_common) -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am echo >> Makefile.am echo 'if BUILD_LINUX' >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'endif' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am $(find_common) ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am # -- >8 -- nobase_nodist_datadir_zfs_tests_tests_DATA = \ functional/pam/utilities.kshlib nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \ functional/pyzfs/pyzfs_unittest.ksh SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS) if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ functional/tmpfile/setup.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl.cfg \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ functional/cachefile/cachefile.kshlib \ functional/casenorm/casenorm.cfg \ functional/casenorm/casenorm.kshlib \ functional/channel_program/channel_common.kshlib \ functional/channel_program/lua_core/tst.args_to_lua.out \ functional/channel_program/lua_core/tst.args_to_lua.zcp \ functional/channel_program/lua_core/tst.divide_by_zero.err \ functional/channel_program/lua_core/tst.divide_by_zero.zcp \ functional/channel_program/lua_core/tst.exists.zcp \ functional/channel_program/lua_core/tst.large_prog.out \ functional/channel_program/lua_core/tst.large_prog.zcp \ functional/channel_program/lua_core/tst.lib_base.lua \ functional/channel_program/lua_core/tst.lib_coroutine.lua \ functional/channel_program/lua_core/tst.lib_strings.lua \ functional/channel_program/lua_core/tst.lib_table.lua \ functional/channel_program/lua_core/tst.nested_neg.zcp \ functional/channel_program/lua_core/tst.nested_pos.zcp \ functional/channel_program/lua_core/tst.recursive.zcp \ functional/channel_program/lua_core/tst.return_large.zcp \ functional/channel_program/lua_core/tst.return_recursive_table.zcp \ functional/channel_program/lua_core/tst.stack_gsub.err \ functional/channel_program/lua_core/tst.stack_gsub.zcp \ functional/channel_program/lua_core/tst.timeout.zcp \ functional/channel_program/synctask_core/tst.bookmark.copy.zcp \ functional/channel_program/synctask_core/tst.bookmark.create.zcp \ functional/channel_program/synctask_core/tst.get_index_props.out \ functional/channel_program/synctask_core/tst.get_index_props.zcp \ functional/channel_program/synctask_core/tst.get_number_props.out \ functional/channel_program/synctask_core/tst.get_number_props.zcp \ functional/channel_program/synctask_core/tst.get_string_props.out \ functional/channel_program/synctask_core/tst.get_string_props.zcp \ functional/channel_program/synctask_core/tst.promote_conflict.zcp \ functional/channel_program/synctask_core/tst.set_props.zcp \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ functional/clean_mirror/clean_mirror_common.kshlib \ functional/clean_mirror/default.cfg \ functional/cli_root/cli_common.kshlib \ functional/cli_root/zfs_copies/zfs_copies.cfg \ functional/cli_root/zfs_copies/zfs_copies.kshlib \ functional/cli_root/zfs_create/properties.kshlib \ functional/cli_root/zfs_create/zfs_create.cfg \ functional/cli_root/zfs_create/zfs_create_common.kshlib \ functional/cli_root/zfs_destroy/zfs_destroy.cfg \ functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \ functional/cli_root/zfs_get/zfs_get_common.kshlib \ functional/cli_root/zfs_get/zfs_get_list_d.kshlib \ functional/cli_root/zfs_jail/jail.conf \ functional/cli_root/zfs_load-key/HEXKEY \ functional/cli_root/zfs_load-key/PASSPHRASE \ functional/cli_root/zfs_load-key/RAWKEY \ functional/cli_root/zfs_load-key/zfs_load-key.cfg \ functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \ functional/cli_root/zfs_mount/zfs_mount.cfg \ functional/cli_root/zfs_mount/zfs_mount.kshlib \ functional/cli_root/zfs_promote/zfs_promote.cfg \ functional/cli_root/zfs_receive/zstd_test_data.txt \ functional/cli_root/zfs_rename/zfs_rename.cfg \ functional/cli_root/zfs_rename/zfs_rename.kshlib \ functional/cli_root/zfs_rollback/zfs_rollback.cfg \ functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \ functional/cli_root/zfs_send/zfs_send.cfg \ functional/cli_root/zfs_set/zfs_set_common.kshlib \ functional/cli_root/zfs_share/zfs_share.cfg \ functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.kshlib \ functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \ functional/cli_root/zfs_wait/zfs_wait.kshlib \ functional/cli_root/zpool_add/zpool_add.cfg \ functional/cli_root/zpool_add/zpool_add.kshlib \ functional/cli_root/zpool_clear/zpool_clear.cfg \ functional/cli_root/zpool_create/draidcfg.gz \ functional/cli_root/zpool_create/zpool_create.cfg \ functional/cli_root/zpool_create/zpool_create.shlib \ functional/cli_root/zpool_destroy/zpool_destroy.cfg \ functional/cli_root/zpool_events/zpool_events.cfg \ functional/cli_root/zpool_events/zpool_events.kshlib \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ functional/cli_root/zpool_resilver/zpool_resilver.cfg \ functional/cli_root/zpool_scrub/zpool_scrub.cfg \ functional/cli_root/zpool_split/zpool_split.cfg \ functional/cli_root/zpool_trim/zpool_trim.kshlib \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ functional/compression/compress.cfg \ functional/compression/testpool_zstd.tar.gz \ functional/deadman/deadman.cfg \ functional/delegate/delegate.cfg \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ functional/history/i386.migratedpool.DAT.Z \ functional/history/i386.orig_history.txt \ functional/history/sparc.migratedpool.DAT.Z \ functional/history/sparc.orig_history.txt \ functional/history/zfs-pool-v4.dat.Z \ functional/inheritance/config001.cfg \ functional/inheritance/config002.cfg \ functional/inheritance/config003.cfg \ functional/inheritance/config004.cfg \ functional/inheritance/config005.cfg \ functional/inheritance/config006.cfg \ functional/inheritance/config007.cfg \ functional/inheritance/config008.cfg \ functional/inheritance/config009.cfg \ functional/inheritance/config010.cfg \ functional/inheritance/config011.cfg \ functional/inheritance/config012.cfg \ functional/inheritance/config013.cfg \ functional/inheritance/config014.cfg \ functional/inheritance/config015.cfg \ functional/inheritance/config016.cfg \ functional/inheritance/config017.cfg \ functional/inheritance/config018.cfg \ functional/inheritance/config019.cfg \ functional/inheritance/config020.cfg \ functional/inheritance/config021.cfg \ functional/inheritance/config022.cfg \ functional/inheritance/config023.cfg \ functional/inheritance/config024.cfg \ functional/inheritance/inherit.kshlib \ functional/inheritance/README.config \ functional/inheritance/README.state \ functional/inheritance/state001.cfg \ functional/inheritance/state002.cfg \ functional/inheritance/state003.cfg \ functional/inheritance/state004.cfg \ functional/inheritance/state005.cfg \ functional/inheritance/state006.cfg \ functional/inheritance/state007.cfg \ functional/inheritance/state008.cfg \ functional/inheritance/state009.cfg \ functional/inheritance/state010.cfg \ functional/inheritance/state011.cfg \ functional/inheritance/state012.cfg \ functional/inheritance/state013.cfg \ functional/inheritance/state014.cfg \ functional/inheritance/state015.cfg \ functional/inheritance/state016.cfg \ functional/inheritance/state017.cfg \ functional/inheritance/state018.cfg \ functional/inheritance/state019.cfg \ functional/inheritance/state020.cfg \ functional/inheritance/state021.cfg \ functional/inheritance/state022.cfg \ functional/inheritance/state023.cfg \ functional/inheritance/state024.cfg \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ functional/mmap/mmap.cfg \ functional/mmp/mmp.cfg \ functional/mmp/mmp.kshlib \ functional/mv_files/mv_files.cfg \ functional/mv_files/mv_files_common.kshlib \ functional/nopwrite/nopwrite.shlib \ functional/no_space/enospc.cfg \ functional/online_offline/online_offline.cfg \ functional/pool_checkpoint/pool_checkpoint.kshlib \ functional/projectquota/projectquota.cfg \ functional/projectquota/projectquota_common.kshlib \ functional/quota/quota.cfg \ functional/quota/quota.kshlib \ functional/redacted_send/redacted.cfg \ functional/redacted_send/redacted.kshlib \ functional/redundancy/redundancy.cfg \ functional/redundancy/redundancy.kshlib \ functional/refreserv/refreserv.cfg \ functional/removal/removal.kshlib \ functional/replacement/replacement.cfg \ functional/reservation/reservation.cfg \ functional/reservation/reservation.shlib \ functional/rsend/dedup_encrypted_zvol.bz2 \ functional/rsend/dedup_encrypted_zvol.zsend.bz2 \ functional/rsend/dedup.zsend.bz2 \ functional/rsend/fs.tar.gz \ functional/rsend/rsend.cfg \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ functional/snapused/snapused.kshlib \ functional/sparse/sparse.cfg \ functional/trim/trim.cfg \ functional/trim/trim.kshlib \ functional/truncate/truncate.cfg \ functional/upgrade/upgrade_common.kshlib \ functional/user_namespace/user_namespace.cfg \ functional/user_namespace/user_namespace_common.kshlib \ functional/userquota/13709_reproducer.bz2 \ functional/userquota/userquota.cfg \ functional/userquota/userquota_common.kshlib \ functional/vdev_zaps/vdev_zaps.kshlib \ functional/xattr/xattr.cfg \ functional/xattr/xattr_common.kshlib \ functional/zvol/zvol.cfg \ functional/zvol/zvol_cli/zvol_cli.cfg \ functional/zvol/zvol_common.shlib \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ functional/idmap_mount/idmap_mount_common.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ functional/acl/off/dosmode.ksh \ functional/acl/off/posixmode.ksh \ functional/acl/off/setup.ksh \ functional/acl/posix/cleanup.ksh \ functional/acl/posix/posix_001_pos.ksh \ functional/acl/posix/posix_002_pos.ksh \ functional/acl/posix/posix_003_pos.ksh \ functional/acl/posix/posix_004_pos.ksh \ functional/acl/posix-sa/cleanup.ksh \ functional/acl/posix-sa/posix_001_pos.ksh \ functional/acl/posix-sa/posix_002_pos.ksh \ functional/acl/posix-sa/posix_003_pos.ksh \ functional/acl/posix-sa/posix_004_pos.ksh \ functional/acl/posix-sa/setup.ksh \ functional/acl/posix/setup.ksh \ functional/alloc_class/alloc_class_001_pos.ksh \ functional/alloc_class/alloc_class_002_neg.ksh \ functional/alloc_class/alloc_class_003_pos.ksh \ functional/alloc_class/alloc_class_004_pos.ksh \ functional/alloc_class/alloc_class_005_pos.ksh \ functional/alloc_class/alloc_class_006_pos.ksh \ functional/alloc_class/alloc_class_007_pos.ksh \ functional/alloc_class/alloc_class_008_pos.ksh \ functional/alloc_class/alloc_class_009_pos.ksh \ functional/alloc_class/alloc_class_010_pos.ksh \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ functional/alloc_class/alloc_class_014_neg.ksh \ functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ functional/append/setup.ksh \ functional/arc/arcstats_runtime_tuning.ksh \ functional/arc/cleanup.ksh \ functional/arc/dbufstats_001_pos.ksh \ functional/arc/dbufstats_002_pos.ksh \ functional/arc/dbufstats_003_pos.ksh \ functional/arc/setup.ksh \ functional/atime/atime_001_pos.ksh \ functional/atime/atime_002_neg.ksh \ functional/atime/atime_003_pos.ksh \ functional/atime/cleanup.ksh \ functional/atime/root_atime_off.ksh \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ functional/bootfs/bootfs_004_neg.ksh \ functional/bootfs/bootfs_005_neg.ksh \ functional/bootfs/bootfs_006_pos.ksh \ functional/bootfs/bootfs_007_pos.ksh \ functional/bootfs/bootfs_008_pos.ksh \ functional/bootfs/cleanup.ksh \ functional/bootfs/setup.ksh \ functional/btree/btree_negative.ksh \ functional/btree/btree_positive.ksh \ functional/cache/cache_001_pos.ksh \ functional/cache/cache_002_pos.ksh \ functional/cache/cache_003_pos.ksh \ functional/cache/cache_004_neg.ksh \ functional/cache/cache_005_neg.ksh \ functional/cache/cache_006_pos.ksh \ functional/cache/cache_007_neg.ksh \ functional/cache/cache_008_neg.ksh \ functional/cache/cache_009_pos.ksh \ functional/cache/cache_010_pos.ksh \ functional/cache/cache_011_pos.ksh \ functional/cache/cache_012_pos.ksh \ functional/cache/cleanup.ksh \ functional/cachefile/cachefile_001_pos.ksh \ functional/cachefile/cachefile_002_pos.ksh \ functional/cachefile/cachefile_003_pos.ksh \ functional/cachefile/cachefile_004_pos.ksh \ functional/cachefile/cleanup.ksh \ functional/cachefile/setup.ksh \ functional/cache/setup.ksh \ functional/casenorm/case_all_values.ksh \ functional/casenorm/cleanup.ksh \ functional/casenorm/insensitive_formd_delete.ksh \ functional/casenorm/insensitive_formd_lookup.ksh \ functional/casenorm/insensitive_none_delete.ksh \ functional/casenorm/insensitive_none_lookup.ksh \ functional/casenorm/mixed_create_failure.ksh \ functional/casenorm/mixed_formd_delete.ksh \ functional/casenorm/mixed_formd_lookup_ci.ksh \ functional/casenorm/mixed_formd_lookup.ksh \ functional/casenorm/mixed_none_delete.ksh \ functional/casenorm/mixed_none_lookup_ci.ksh \ functional/casenorm/mixed_none_lookup.ksh \ functional/casenorm/norm_all_values.ksh \ functional/casenorm/sensitive_formd_delete.ksh \ functional/casenorm/sensitive_formd_lookup.ksh \ functional/casenorm/sensitive_none_delete.ksh \ functional/casenorm/sensitive_none_lookup.ksh \ functional/casenorm/setup.ksh \ functional/channel_program/lua_core/cleanup.ksh \ functional/channel_program/lua_core/setup.ksh \ functional/channel_program/lua_core/tst.args_to_lua.ksh \ functional/channel_program/lua_core/tst.divide_by_zero.ksh \ functional/channel_program/lua_core/tst.exists.ksh \ functional/channel_program/lua_core/tst.integer_illegal.ksh \ functional/channel_program/lua_core/tst.integer_overflow.ksh \ functional/channel_program/lua_core/tst.language_functions_neg.ksh \ functional/channel_program/lua_core/tst.language_functions_pos.ksh \ functional/channel_program/lua_core/tst.large_prog.ksh \ functional/channel_program/lua_core/tst.libraries.ksh \ functional/channel_program/lua_core/tst.memory_limit.ksh \ functional/channel_program/lua_core/tst.nested_neg.ksh \ functional/channel_program/lua_core/tst.nested_pos.ksh \ functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \ functional/channel_program/lua_core/tst.recursive_neg.ksh \ functional/channel_program/lua_core/tst.recursive_pos.ksh \ functional/channel_program/lua_core/tst.return_large.ksh \ functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \ functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \ functional/channel_program/lua_core/tst.return_recursive_table.ksh \ functional/channel_program/lua_core/tst.stack_gsub.ksh \ functional/channel_program/lua_core/tst.timeout.ksh \ functional/channel_program/synctask_core/cleanup.ksh \ functional/channel_program/synctask_core/setup.ksh \ functional/channel_program/synctask_core/tst.bookmark.copy.ksh \ functional/channel_program/synctask_core/tst.bookmark.create.ksh \ functional/channel_program/synctask_core/tst.destroy_fs.ksh \ functional/channel_program/synctask_core/tst.destroy_snap.ksh \ functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \ functional/channel_program/synctask_core/tst.get_index_props.ksh \ functional/channel_program/synctask_core/tst.get_mountpoint.ksh \ functional/channel_program/synctask_core/tst.get_neg.ksh \ functional/channel_program/synctask_core/tst.get_number_props.ksh \ functional/channel_program/synctask_core/tst.get_string_props.ksh \ functional/channel_program/synctask_core/tst.get_type.ksh \ functional/channel_program/synctask_core/tst.get_userquota.ksh \ functional/channel_program/synctask_core/tst.get_written.ksh \ functional/channel_program/synctask_core/tst.inherit.ksh \ functional/channel_program/synctask_core/tst.list_bookmarks.ksh \ functional/channel_program/synctask_core/tst.list_children.ksh \ functional/channel_program/synctask_core/tst.list_clones.ksh \ functional/channel_program/synctask_core/tst.list_holds.ksh \ functional/channel_program/synctask_core/tst.list_snapshots.ksh \ functional/channel_program/synctask_core/tst.list_system_props.ksh \ functional/channel_program/synctask_core/tst.list_user_props.ksh \ functional/channel_program/synctask_core/tst.parse_args_neg.ksh \ functional/channel_program/synctask_core/tst.promote_conflict.ksh \ functional/channel_program/synctask_core/tst.promote_multiple.ksh \ functional/channel_program/synctask_core/tst.promote_simple.ksh \ functional/channel_program/synctask_core/tst.rollback_mult.ksh \ functional/channel_program/synctask_core/tst.rollback_one.ksh \ functional/channel_program/synctask_core/tst.set_props.ksh \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ functional/chattr/chattr_002_neg.ksh \ functional/chattr/cleanup.ksh \ functional/chattr/setup.ksh \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ functional/checksum/setup.ksh \ functional/clean_mirror/clean_mirror_001_pos.ksh \ functional/clean_mirror/clean_mirror_002_pos.ksh \ functional/clean_mirror/clean_mirror_003_pos.ksh \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ functional/cli_root/zdb/zdb_005_pos.ksh \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ functional/cli_root/zfs_change-key/cleanup.ksh \ functional/cli_root/zfs_change-key/setup.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \ functional/cli_root/zfs_change-key/zfs_change-key.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \ functional/cli_root/zfs/cleanup.ksh \ functional/cli_root/zfs_clone/cleanup.ksh \ functional/cli_root/zfs_clone/setup.ksh \ functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \ functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \ functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \ functional/cli_root/zfs_copies/cleanup.ksh \ functional/cli_root/zfs_copies/setup.ksh \ functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \ functional/cli_root/zfs_create/cleanup.ksh \ functional/cli_root/zfs_create/setup.ksh \ functional/cli_root/zfs_create/zfs_create_001_pos.ksh \ functional/cli_root/zfs_create/zfs_create_002_pos.ksh \ functional/cli_root/zfs_create/zfs_create_003_pos.ksh \ functional/cli_root/zfs_create/zfs_create_004_pos.ksh \ functional/cli_root/zfs_create/zfs_create_005_pos.ksh \ functional/cli_root/zfs_create/zfs_create_006_pos.ksh \ functional/cli_root/zfs_create/zfs_create_007_pos.ksh \ functional/cli_root/zfs_create/zfs_create_008_neg.ksh \ functional/cli_root/zfs_create/zfs_create_009_neg.ksh \ functional/cli_root/zfs_create/zfs_create_010_neg.ksh \ functional/cli_root/zfs_create/zfs_create_011_pos.ksh \ functional/cli_root/zfs_create/zfs_create_012_pos.ksh \ functional/cli_root/zfs_create/zfs_create_013_pos.ksh \ functional/cli_root/zfs_create/zfs_create_014_pos.ksh \ functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \ functional/cli_root/zfs_create/zfs_create_dryrun.ksh \ functional/cli_root/zfs_create/zfs_create_encrypted.ksh \ functional/cli_root/zfs_create/zfs_create_nomount.ksh \ functional/cli_root/zfs_create/zfs_create_verbose.ksh \ functional/cli_root/zfs_destroy/cleanup.ksh \ functional/cli_root/zfs_destroy/setup.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \ functional/cli_root/zfs_diff/cleanup.ksh \ functional/cli_root/zfs_diff/setup.ksh \ functional/cli_root/zfs_diff/zfs_diff_changes.ksh \ functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \ functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \ functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \ functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \ functional/cli_root/zfs_diff/zfs_diff_types.ksh \ functional/cli_root/zfs_get/cleanup.ksh \ functional/cli_root/zfs_get/setup.ksh \ functional/cli_root/zfs_get/zfs_get_001_pos.ksh \ functional/cli_root/zfs_get/zfs_get_002_pos.ksh \ functional/cli_root/zfs_get/zfs_get_003_pos.ksh \ functional/cli_root/zfs_get/zfs_get_004_pos.ksh \ functional/cli_root/zfs_get/zfs_get_005_neg.ksh \ functional/cli_root/zfs_get/zfs_get_006_neg.ksh \ functional/cli_root/zfs_get/zfs_get_007_neg.ksh \ functional/cli_root/zfs_get/zfs_get_008_pos.ksh \ functional/cli_root/zfs_get/zfs_get_009_pos.ksh \ functional/cli_root/zfs_get/zfs_get_010_neg.ksh \ functional/cli_root/zfs_ids_to_path/cleanup.ksh \ functional/cli_root/zfs_ids_to_path/setup.ksh \ functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \ functional/cli_root/zfs_inherit/cleanup.ksh \ functional/cli_root/zfs_inherit/setup.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \ functional/cli_root/zfs_jail/cleanup.ksh \ functional/cli_root/zfs_jail/setup.ksh \ functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \ functional/cli_root/zfs_load-key/cleanup.ksh \ functional/cli_root/zfs_load-key/setup.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \ functional/cli_root/zfs_load-key/zfs_load-key.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \ functional/cli_root/zfs_mount/cleanup.ksh \ functional/cli_root/zfs_mount/setup.ksh \ functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ functional/cli_root/zfs_program/setup.ksh \ functional/cli_root/zfs_program/zfs_program_json.ksh \ functional/cli_root/zfs_promote/cleanup.ksh \ functional/cli_root/zfs_promote/setup.ksh \ functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \ functional/cli_root/zfs_property/cleanup.ksh \ functional/cli_root/zfs_property/setup.ksh \ functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \ functional/cli_root/zfs_receive/cleanup.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \ functional/cli_root/zfs_receive/setup.ksh \ functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_-e.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \ functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \ functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \ functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \ functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \ functional/cli_root/zfs_reservation/cleanup.ksh \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \ functional/cli_root/zfs_send/cleanup.ksh \ functional/cli_root/zfs_send/setup.ksh \ functional/cli_root/zfs_send/zfs_send_001_pos.ksh \ functional/cli_root/zfs_send/zfs_send_002_pos.ksh \ functional/cli_root/zfs_send/zfs_send_003_pos.ksh \ functional/cli_root/zfs_send/zfs_send_004_neg.ksh \ functional/cli_root/zfs_send/zfs_send_005_pos.ksh \ functional/cli_root/zfs_send/zfs_send_006_pos.ksh \ functional/cli_root/zfs_send/zfs_send_007_pos.ksh \ functional/cli_root/zfs_send/zfs_send-b.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ functional/cli_root/zfs_set/canmount_002_pos.ksh \ functional/cli_root/zfs_set/canmount_003_pos.ksh \ functional/cli_root/zfs_set/canmount_004_pos.ksh \ functional/cli_root/zfs_set/checksum_001_pos.ksh \ functional/cli_root/zfs_set/cleanup.ksh \ functional/cli_root/zfs_set/compression_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_002_pos.ksh \ functional/cli_root/zfs_set/mountpoint_003_pos.ksh \ functional/cli_root/zfs_set/onoffs_001_pos.ksh \ functional/cli_root/zfs_set/property_alias_001_pos.ksh \ functional/cli_root/zfs_set/readonly_001_pos.ksh \ functional/cli_root/zfs_set/reservation_001_neg.ksh \ functional/cli_root/zfs_set/ro_props_001_pos.ksh \ functional/cli_root/zfs_set/setup.ksh \ functional/cli_root/zfs_set/share_mount_001_neg.ksh \ functional/cli_root/zfs_set/snapdir_001_pos.ksh \ functional/cli_root/zfs/setup.ksh \ functional/cli_root/zfs_set/user_property_001_pos.ksh \ functional/cli_root/zfs_set/user_property_002_pos.ksh \ functional/cli_root/zfs_set/user_property_003_neg.ksh \ functional/cli_root/zfs_set/user_property_004_pos.ksh \ functional/cli_root/zfs_set/version_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_002_neg.ksh \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ functional/cli_root/zfs_share/zfs_share_002_pos.ksh \ functional/cli_root/zfs_share/zfs_share_003_pos.ksh \ functional/cli_root/zfs_share/zfs_share_004_pos.ksh \ functional/cli_root/zfs_share/zfs_share_005_pos.ksh \ functional/cli_root/zfs_share/zfs_share_006_pos.ksh \ functional/cli_root/zfs_share/zfs_share_007_neg.ksh \ functional/cli_root/zfs_share/zfs_share_008_neg.ksh \ functional/cli_root/zfs_share/zfs_share_009_neg.ksh \ functional/cli_root/zfs_share/zfs_share_010_neg.ksh \ functional/cli_root/zfs_share/zfs_share_011_pos.ksh \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \ functional/cli_root/zfs_sysfs/cleanup.ksh \ functional/cli_root/zfs_sysfs/setup.ksh \ functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \ functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \ functional/cli_root/zfs_unload-key/cleanup.ksh \ functional/cli_root/zfs_unload-key/setup.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \ functional/cli_root/zfs_unmount/cleanup.ksh \ functional/cli_root/zfs_unmount/setup.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \ functional/cli_root/zfs_unshare/cleanup.ksh \ functional/cli_root/zfs_unshare/setup.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \ functional/cli_root/zfs_upgrade/cleanup.ksh \ functional/cli_root/zfs_upgrade/setup.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \ functional/cli_root/zfs_wait/cleanup.ksh \ functional/cli_root/zfs_wait/setup.ksh \ functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \ functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ functional/cli_root/zhack/zhack_label_repair_001.ksh \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ functional/cli_root/zpool_add/zpool_add_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_005_pos.ksh \ functional/cli_root/zpool_add/zpool_add_006_pos.ksh \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \ functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \ functional/cli_root/zpool_create/cleanup.ksh \ functional/cli_root/zpool_create/create-o_ashift.ksh \ functional/cli_root/zpool_create/setup.ksh \ functional/cli_root/zpool_create/zpool_create_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_007_neg.ksh \ functional/cli_root/zpool_create/zpool_create_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_009_neg.ksh \ functional/cli_root/zpool_create/zpool_create_010_neg.ksh \ functional/cli_root/zpool_create/zpool_create_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_012_neg.ksh \ functional/cli_root/zpool_create/zpool_create_014_neg.ksh \ functional/cli_root/zpool_create/zpool_create_015_neg.ksh \ functional/cli_root/zpool_create/zpool_create_016_pos.ksh \ functional/cli_root/zpool_create/zpool_create_017_neg.ksh \ functional/cli_root/zpool_create/zpool_create_018_pos.ksh \ functional/cli_root/zpool_create/zpool_create_019_pos.ksh \ functional/cli_root/zpool_create/zpool_create_020_pos.ksh \ functional/cli_root/zpool_create/zpool_create_021_pos.ksh \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \ functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \ functional/cli_root/zpool_create/zpool_create_tempname.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \ functional/cli_root/zpool_detach/cleanup.ksh \ functional/cli_root/zpool_detach/setup.ksh \ functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \ functional/cli_root/zpool_events/cleanup.ksh \ functional/cli_root/zpool_events/setup.ksh \ functional/cli_root/zpool_events/zpool_events_clear.ksh \ functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \ functional/cli_root/zpool_events/zpool_events_cliargs.ksh \ functional/cli_root/zpool_events/zpool_events_duplicates.ksh \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \ functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \ functional/cli_root/zpool_export/cleanup.ksh \ functional/cli_root/zpool_export/setup.ksh \ functional/cli_root/zpool_export/zpool_export_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ functional/cli_root/zpool_get/zpool_get_004_neg.ksh \ functional/cli_root/zpool_get/zpool_get_005_pos.ksh \ functional/cli_root/zpool_history/cleanup.ksh \ functional/cli_root/zpool_history/setup.ksh \ functional/cli_root/zpool_history/zpool_history_001_neg.ksh \ functional/cli_root/zpool_history/zpool_history_002_pos.ksh \ functional/cli_root/zpool_import/cleanup.ksh \ functional/cli_root/zpool_import/import_cachefile_device_added.ksh \ functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \ functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ functional/cli_root/zpool_import/setup.ksh \ functional/cli_root/zpool_import/zpool_import_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_004_pos.ksh \ functional/cli_root/zpool_import/zpool_import_005_pos.ksh \ functional/cli_root/zpool_import/zpool_import_006_pos.ksh \ functional/cli_root/zpool_import/zpool_import_007_pos.ksh \ functional/cli_root/zpool_import/zpool_import_008_pos.ksh \ functional/cli_root/zpool_import/zpool_import_009_neg.ksh \ functional/cli_root/zpool_import/zpool_import_010_pos.ksh \ functional/cli_root/zpool_import/zpool_import_011_neg.ksh \ functional/cli_root/zpool_import/zpool_import_012_pos.ksh \ functional/cli_root/zpool_import/zpool_import_013_neg.ksh \ functional/cli_root/zpool_import/zpool_import_014_pos.ksh \ functional/cli_root/zpool_import/zpool_import_015_pos.ksh \ functional/cli_root/zpool_import/zpool_import_016_pos.ksh \ functional/cli_root/zpool_import/zpool_import_017_pos.ksh \ functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \ functional/cli_root/zpool_import/zpool_import_errata3.ksh \ functional/cli_root/zpool_import/zpool_import_errata4.ksh \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \ functional/cli_root/zpool_offline/cleanup.ksh \ functional/cli_root/zpool_offline/setup.ksh \ functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \ functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \ functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \ functional/cli_root/zpool_online/cleanup.ksh \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \ functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \ functional/cli_root/zpool_reopen/cleanup.ksh \ functional/cli_root/zpool_reopen/setup.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \ functional/cli_root/zpool_replace/cleanup.ksh \ functional/cli_root/zpool_replace/replace-o_ashift.ksh \ functional/cli_root/zpool_replace/replace_prop_ashift.ksh \ functional/cli_root/zpool_replace/setup.ksh \ functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \ functional/cli_root/zpool_resilver/cleanup.ksh \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ functional/cli_root/zpool_split/zpool_split_devices.ksh \ functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \ functional/cli_root/zpool_split/zpool_split_encryption.ksh \ functional/cli_root/zpool_split/zpool_split_indirect.ksh \ functional/cli_root/zpool_split/zpool_split_props.ksh \ functional/cli_root/zpool_split/zpool_split_resilver.ksh \ functional/cli_root/zpool_split/zpool_split_vdevs.ksh \ functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \ functional/cli_root/zpool_status/cleanup.ksh \ functional/cli_root/zpool_status/setup.ksh \ functional/cli_root/zpool_status/zpool_status_001_pos.ksh \ functional/cli_root/zpool_status/zpool_status_002_pos.ksh \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \ functional/cli_root/zpool_trim/cleanup.ksh \ functional/cli_root/zpool_trim/setup.ksh \ functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ functional/cli_root/zpool_trim/zpool_trim_partial.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_secure.ksh \ functional/cli_root/zpool_trim/zpool_trim_split.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \ functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \ functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \ functional/cli_root/zpool_upgrade/cleanup.ksh \ functional/cli_root/zpool_upgrade/setup.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \ functional/cli_root/zpool_wait/cleanup.ksh \ functional/cli_root/zpool_wait/scan/cleanup.ksh \ functional/cli_root/zpool_wait/scan/setup.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \ functional/cli_root/zpool_wait/setup.ksh \ functional/cli_root/zpool_wait/zpool_wait_discard.ksh \ functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \ functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_usage.ksh \ functional/cli_root/zpool/zpool_001_neg.ksh \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ functional/cli_user/misc/zfs_001_neg.ksh \ functional/cli_user/misc/zfs_allow_001_neg.ksh \ functional/cli_user/misc/zfs_clone_001_neg.ksh \ functional/cli_user/misc/zfs_create_001_neg.ksh \ functional/cli_user/misc/zfs_destroy_001_neg.ksh \ functional/cli_user/misc/zfs_get_001_neg.ksh \ functional/cli_user/misc/zfs_inherit_001_neg.ksh \ functional/cli_user/misc/zfs_mount_001_neg.ksh \ functional/cli_user/misc/zfs_promote_001_neg.ksh \ functional/cli_user/misc/zfs_receive_001_neg.ksh \ functional/cli_user/misc/zfs_rename_001_neg.ksh \ functional/cli_user/misc/zfs_rollback_001_neg.ksh \ functional/cli_user/misc/zfs_send_001_neg.ksh \ functional/cli_user/misc/zfs_set_001_neg.ksh \ functional/cli_user/misc/zfs_share_001_neg.ksh \ functional/cli_user/misc/zfs_snapshot_001_neg.ksh \ functional/cli_user/misc/zfs_unallow_001_neg.ksh \ functional/cli_user/misc/zfs_unmount_001_neg.ksh \ functional/cli_user/misc/zfs_unshare_001_neg.ksh \ functional/cli_user/misc/zfs_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_001_neg.ksh \ functional/cli_user/misc/zpool_add_001_neg.ksh \ functional/cli_user/misc/zpool_attach_001_neg.ksh \ functional/cli_user/misc/zpool_clear_001_neg.ksh \ functional/cli_user/misc/zpool_create_001_neg.ksh \ functional/cli_user/misc/zpool_destroy_001_neg.ksh \ functional/cli_user/misc/zpool_detach_001_neg.ksh \ functional/cli_user/misc/zpool_export_001_neg.ksh \ functional/cli_user/misc/zpool_get_001_neg.ksh \ functional/cli_user/misc/zpool_history_001_neg.ksh \ functional/cli_user/misc/zpool_import_001_neg.ksh \ functional/cli_user/misc/zpool_import_002_neg.ksh \ functional/cli_user/misc/zpool_offline_001_neg.ksh \ functional/cli_user/misc/zpool_online_001_neg.ksh \ functional/cli_user/misc/zpool_remove_001_neg.ksh \ functional/cli_user/misc/zpool_replace_001_neg.ksh \ functional/cli_user/misc/zpool_scrub_001_neg.ksh \ functional/cli_user/misc/zpool_set_001_neg.ksh \ functional/cli_user/misc/zpool_status_001_neg.ksh \ functional/cli_user/misc/zpool_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_wait_privilege.ksh \ functional/cli_user/zfs_list/cleanup.ksh \ functional/cli_user/zfs_list/setup.ksh \ functional/cli_user/zfs_list/zfs_list_001_pos.ksh \ functional/cli_user/zfs_list/zfs_list_002_pos.ksh \ functional/cli_user/zfs_list/zfs_list_003_pos.ksh \ functional/cli_user/zfs_list/zfs_list_004_neg.ksh \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \ functional/cli_user/zpool_list/cleanup.ksh \ functional/cli_user/zpool_list/setup.ksh \ functional/cli_user/zpool_list/zpool_list_001_pos.ksh \ functional/cli_user/zpool_list/zpool_list_002_neg.ksh \ functional/cli_user/zpool_status/cleanup.ksh \ functional/cli_user/zpool_status/setup.ksh \ functional/cli_user/zpool_status/zpool_status_003_pos.ksh \ functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \ functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \ functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \ functional/compression/cleanup.ksh \ functional/compression/compress_001_pos.ksh \ functional/compression/compress_002_pos.ksh \ functional/compression/compress_003_pos.ksh \ functional/compression/compress_004_pos.ksh \ functional/compression/compress_zstd_bswap.ksh \ functional/compression/l2arc_compressed_arc_disabled.ksh \ functional/compression/l2arc_compressed_arc.ksh \ functional/compression/l2arc_encrypted.ksh \ functional/compression/l2arc_encrypted_no_compressed_arc.ksh \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ functional/crtime/setup.ksh \ functional/ctime/cleanup.ksh \ functional/ctime/ctime_001_pos.ksh \ functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_002_pos.ksh \ functional/delegate/zfs_allow_003_pos.ksh \ functional/delegate/zfs_allow_004_pos.ksh \ functional/delegate/zfs_allow_005_pos.ksh \ functional/delegate/zfs_allow_006_pos.ksh \ functional/delegate/zfs_allow_007_pos.ksh \ functional/delegate/zfs_allow_008_pos.ksh \ functional/delegate/zfs_allow_009_neg.ksh \ functional/delegate/zfs_allow_010_pos.ksh \ functional/delegate/zfs_allow_011_neg.ksh \ functional/delegate/zfs_allow_012_neg.ksh \ functional/delegate/zfs_unallow_001_pos.ksh \ functional/delegate/zfs_unallow_002_pos.ksh \ functional/delegate/zfs_unallow_003_pos.ksh \ functional/delegate/zfs_unallow_004_pos.ksh \ functional/delegate/zfs_unallow_005_pos.ksh \ functional/delegate/zfs_unallow_006_pos.ksh \ functional/delegate/zfs_unallow_007_neg.ksh \ functional/delegate/zfs_unallow_008_neg.ksh \ functional/devices/cleanup.ksh \ functional/devices/devices_001_pos.ksh \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ functional/dos_attributes/write_dos_attrs_001.ksh \ functional/events/cleanup.ksh \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_sequential.ksh \ functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_zero-range.ksh \ functional/fallocate/setup.ksh \ functional/fault/auto_offline_001_pos.ksh \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ + functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ functional/fault/cleanup.ksh \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/scrub_after_resilver.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ functional/features/async_destroy/cleanup.ksh \ functional/features/async_destroy/setup.ksh \ functional/features/large_dnode/cleanup.ksh \ functional/features/large_dnode/large_dnode_001_pos.ksh \ functional/features/large_dnode/large_dnode_002_pos.ksh \ functional/features/large_dnode/large_dnode_003_pos.ksh \ functional/features/large_dnode/large_dnode_004_neg.ksh \ functional/features/large_dnode/large_dnode_005_pos.ksh \ functional/features/large_dnode/large_dnode_006_pos.ksh \ functional/features/large_dnode/large_dnode_007_neg.ksh \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ functional/history/history_001_pos.ksh \ functional/history/history_002_pos.ksh \ functional/history/history_003_pos.ksh \ functional/history/history_004_pos.ksh \ functional/history/history_005_neg.ksh \ functional/history/history_006_neg.ksh \ functional/history/history_007_pos.ksh \ functional/history/history_008_pos.ksh \ functional/history/history_009_pos.ksh \ functional/history/history_010_pos.ksh \ functional/history/setup.ksh \ functional/inheritance/cleanup.ksh \ functional/inheritance/inherit_001_pos.ksh \ functional/inuse/inuse_001_pos.ksh \ functional/inuse/inuse_003_pos.ksh \ functional/inuse/inuse_004_pos.ksh \ functional/inuse/inuse_005_pos.ksh \ functional/inuse/inuse_006_pos.ksh \ functional/inuse/inuse_007_pos.ksh \ functional/inuse/inuse_008_pos.ksh \ functional/inuse/inuse_009_pos.ksh \ functional/inuse/setup.ksh \ functional/io/cleanup.ksh \ functional/io/io_uring.ksh \ functional/io/libaio.ksh \ functional/io/mmap.ksh \ functional/io/posixaio.ksh \ functional/io/psync.ksh \ functional/io/setup.ksh \ functional/io/sync.ksh \ functional/l2arc/cleanup.ksh \ functional/l2arc/l2arc_arcstats_pos.ksh \ functional/l2arc/l2arc_l2miss_pos.ksh \ functional/l2arc/l2arc_mfuonly_pos.ksh \ functional/l2arc/persist_l2arc_001_pos.ksh \ functional/l2arc/persist_l2arc_002_pos.ksh \ functional/l2arc/persist_l2arc_003_neg.ksh \ functional/l2arc/persist_l2arc_004_pos.ksh \ functional/l2arc/persist_l2arc_005_pos.ksh \ functional/l2arc/setup.ksh \ functional/large_files/cleanup.ksh \ functional/large_files/large_files_001_pos.ksh \ functional/large_files/large_files_002_pos.ksh \ functional/large_files/setup.ksh \ functional/largest_pool/largest_pool_001_pos.ksh \ functional/libzfs/cleanup.ksh \ functional/libzfs/libzfs_input.ksh \ functional/libzfs/setup.ksh \ functional/limits/cleanup.ksh \ functional/limits/filesystem_count.ksh \ functional/limits/filesystem_limit.ksh \ functional/limits/setup.ksh \ functional/limits/snapshot_count.ksh \ functional/limits/snapshot_limit.ksh \ functional/link_count/cleanup.ksh \ functional/link_count/link_count_001.ksh \ functional/link_count/link_count_root_inode.ksh \ functional/link_count/setup.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ functional/migration/migration_002_pos.ksh \ functional/migration/migration_003_pos.ksh \ functional/migration/migration_004_pos.ksh \ functional/migration/migration_005_pos.ksh \ functional/migration/migration_006_pos.ksh \ functional/migration/migration_007_pos.ksh \ functional/migration/migration_008_pos.ksh \ functional/migration/migration_009_pos.ksh \ functional/migration/migration_010_pos.ksh \ functional/migration/migration_011_pos.ksh \ functional/migration/migration_012_pos.ksh \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ functional/mmp/mmp_exported_import.ksh \ functional/mmp/mmp_hostid.ksh \ functional/mmp/mmp_inactive_import.ksh \ functional/mmp/mmp_interval.ksh \ functional/mmp/mmp_on_off.ksh \ functional/mmp/mmp_on_thread.ksh \ functional/mmp/mmp_on_uberblocks.ksh \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ functional/mount/cleanup.ksh \ functional/mount/setup.ksh \ functional/mount/umount_001.ksh \ functional/mount/umountall_001.ksh \ functional/mount/umount_unlinked_drain.ksh \ functional/mv_files/cleanup.ksh \ functional/mv_files/mv_files_001_pos.ksh \ functional/mv_files/mv_files_002_pos.ksh \ functional/mv_files/random_creation.ksh \ functional/mv_files/setup.ksh \ functional/nestedfs/cleanup.ksh \ functional/nestedfs/nestedfs_001_pos.ksh \ functional/nestedfs/setup.ksh \ functional/nopwrite/cleanup.ksh \ functional/nopwrite/nopwrite_copies.ksh \ functional/nopwrite/nopwrite_mtime.ksh \ functional/nopwrite/nopwrite_negative.ksh \ functional/nopwrite/nopwrite_promoted_clone.ksh \ functional/nopwrite/nopwrite_recsize.ksh \ functional/nopwrite/nopwrite_sync.ksh \ functional/nopwrite/nopwrite_varying_compression.ksh \ functional/nopwrite/nopwrite_volume.ksh \ functional/nopwrite/setup.ksh \ functional/no_space/cleanup.ksh \ functional/no_space/enospc_001_pos.ksh \ functional/no_space/enospc_002_pos.ksh \ functional/no_space/enospc_003_pos.ksh \ functional/no_space/enospc_df.ksh \ functional/no_space/enospc_ganging.ksh \ functional/no_space/enospc_rm.ksh \ functional/no_space/setup.ksh \ functional/online_offline/cleanup.ksh \ functional/online_offline/online_offline_001_pos.ksh \ functional/online_offline/online_offline_002_neg.ksh \ functional/online_offline/online_offline_003_neg.ksh \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ functional/pool_checkpoint/checkpoint_big_rewind.ksh \ functional/pool_checkpoint/checkpoint_capacity.ksh \ functional/pool_checkpoint/checkpoint_conf_change.ksh \ functional/pool_checkpoint/checkpoint_discard_busy.ksh \ functional/pool_checkpoint/checkpoint_discard.ksh \ functional/pool_checkpoint/checkpoint_discard_many.ksh \ functional/pool_checkpoint/checkpoint_indirect.ksh \ functional/pool_checkpoint/checkpoint_invalid.ksh \ functional/pool_checkpoint/checkpoint_lun_expsz.ksh \ functional/pool_checkpoint/checkpoint_open.ksh \ functional/pool_checkpoint/checkpoint_removal.ksh \ functional/pool_checkpoint/checkpoint_rewind.ksh \ functional/pool_checkpoint/checkpoint_ro_rewind.ksh \ functional/pool_checkpoint/checkpoint_sm_scale.ksh \ functional/pool_checkpoint/checkpoint_twice.ksh \ functional/pool_checkpoint/checkpoint_vdev_add.ksh \ functional/pool_checkpoint/checkpoint_zdb.ksh \ functional/pool_checkpoint/checkpoint_zhack_feat.ksh \ functional/pool_checkpoint/cleanup.ksh \ functional/pool_checkpoint/setup.ksh \ functional/pool_names/pool_names_001_pos.ksh \ functional/pool_names/pool_names_002_neg.ksh \ functional/poolversion/cleanup.ksh \ functional/poolversion/poolversion_001_pos.ksh \ functional/poolversion/poolversion_002_pos.ksh \ functional/poolversion/setup.ksh \ functional/privilege/cleanup.ksh \ functional/privilege/privilege_001_pos.ksh \ functional/privilege/privilege_002_pos.ksh \ functional/privilege/setup.ksh \ functional/procfs/cleanup.ksh \ functional/procfs/pool_state.ksh \ functional/procfs/procfs_list_basic.ksh \ functional/procfs/procfs_list_concurrent_readers.ksh \ functional/procfs/procfs_list_stale_read.ksh \ functional/procfs/setup.ksh \ functional/projectquota/cleanup.ksh \ functional/projectquota/projectid_001_pos.ksh \ functional/projectquota/projectid_002_pos.ksh \ functional/projectquota/projectid_003_pos.ksh \ functional/projectquota/projectquota_001_pos.ksh \ functional/projectquota/projectquota_002_pos.ksh \ functional/projectquota/projectquota_003_pos.ksh \ functional/projectquota/projectquota_004_neg.ksh \ functional/projectquota/projectquota_005_pos.ksh \ functional/projectquota/projectquota_006_pos.ksh \ functional/projectquota/projectquota_007_pos.ksh \ functional/projectquota/projectquota_008_pos.ksh \ functional/projectquota/projectquota_009_pos.ksh \ functional/projectquota/projectspace_001_pos.ksh \ functional/projectquota/projectspace_002_pos.ksh \ functional/projectquota/projectspace_003_pos.ksh \ functional/projectquota/projectspace_004_pos.ksh \ functional/projectquota/projecttree_001_pos.ksh \ functional/projectquota/projecttree_002_pos.ksh \ functional/projectquota/projecttree_003_neg.ksh \ functional/projectquota/setup.ksh \ functional/quota/cleanup.ksh \ functional/quota/quota_001_pos.ksh \ functional/quota/quota_002_pos.ksh \ functional/quota/quota_003_pos.ksh \ functional/quota/quota_004_pos.ksh \ functional/quota/quota_005_pos.ksh \ functional/quota/quota_006_neg.ksh \ functional/quota/setup.ksh \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_003_pos.ksh \ functional/raidz/raidz_004_pos.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ functional/redacted_send/redacted_contents.ksh \ functional/redacted_send/redacted_deleted.ksh \ functional/redacted_send/redacted_disabled_feature.ksh \ functional/redacted_send/redacted_embedded.ksh \ functional/redacted_send/redacted_holes.ksh \ functional/redacted_send/redacted_incrementals.ksh \ functional/redacted_send/redacted_largeblocks.ksh \ functional/redacted_send/redacted_many_clones.ksh \ functional/redacted_send/redacted_mixed_recsize.ksh \ functional/redacted_send/redacted_mounts.ksh \ functional/redacted_send/redacted_negative.ksh \ functional/redacted_send/redacted_origin.ksh \ functional/redacted_send/redacted_panic.ksh \ functional/redacted_send/redacted_props.ksh \ functional/redacted_send/redacted_resume.ksh \ functional/redacted_send/redacted_size.ksh \ functional/redacted_send/redacted_volume.ksh \ functional/redacted_send/setup.ksh \ functional/redundancy/cleanup.ksh \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ functional/redundancy/redundancy_raidz3.ksh \ functional/redundancy/redundancy_raidz.ksh \ functional/redundancy/redundancy_stripe.ksh \ functional/redundancy/setup.ksh \ functional/refquota/cleanup.ksh \ functional/refquota/refquota_001_pos.ksh \ functional/refquota/refquota_002_pos.ksh \ functional/refquota/refquota_003_pos.ksh \ functional/refquota/refquota_004_pos.ksh \ functional/refquota/refquota_005_pos.ksh \ functional/refquota/refquota_006_neg.ksh \ functional/refquota/refquota_007_neg.ksh \ functional/refquota/refquota_008_neg.ksh \ functional/refquota/setup.ksh \ functional/refreserv/cleanup.ksh \ functional/refreserv/refreserv_001_pos.ksh \ functional/refreserv/refreserv_002_pos.ksh \ functional/refreserv/refreserv_003_pos.ksh \ functional/refreserv/refreserv_004_pos.ksh \ functional/refreserv/refreserv_005_pos.ksh \ functional/refreserv/refreserv_multi_raidz.ksh \ functional/refreserv/refreserv_raidz.ksh \ functional/refreserv/setup.ksh \ functional/removal/cleanup.ksh \ functional/removal/removal_all_vdev.ksh \ functional/removal/removal_cancel.ksh \ functional/removal/removal_check_space.ksh \ functional/removal/removal_condense_export.ksh \ functional/removal/removal_multiple_indirection.ksh \ functional/removal/removal_nopwrite.ksh \ functional/removal/removal_remap_deadlists.ksh \ functional/removal/removal_reservation.ksh \ functional/removal/removal_resume_export.ksh \ functional/removal/removal_sanity.ksh \ functional/removal/removal_with_add.ksh \ functional/removal/removal_with_create_fs.ksh \ functional/removal/removal_with_dedup.ksh \ functional/removal/removal_with_errors.ksh \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ functional/removal/removal_with_send_recv.ksh \ functional/removal/removal_with_snapshot.ksh \ functional/removal/removal_with_write.ksh \ functional/removal/removal_with_zdb.ksh \ functional/removal/remove_attach_mirror.ksh \ functional/removal/remove_expanded.ksh \ functional/removal/remove_indirect.ksh \ functional/removal/remove_mirror.ksh \ functional/removal/remove_mirror_sanity.ksh \ functional/removal/remove_raidz.ksh \ functional/rename_dirs/cleanup.ksh \ functional/rename_dirs/rename_dirs_001_pos.ksh \ functional/rename_dirs/setup.ksh \ functional/renameat2/cleanup.ksh \ functional/renameat2/setup.ksh \ functional/renameat2/renameat2_exchange.ksh \ functional/renameat2/renameat2_noreplace.ksh \ functional/renameat2/renameat2_whiteout.ksh \ functional/replacement/attach_import.ksh \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ functional/replacement/rebuild_multiple.ksh \ functional/replacement/rebuild_raidz.ksh \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ functional/replacement/setup.ksh \ functional/reservation/cleanup.ksh \ functional/reservation/reservation_001_pos.ksh \ functional/reservation/reservation_002_pos.ksh \ functional/reservation/reservation_003_pos.ksh \ functional/reservation/reservation_004_pos.ksh \ functional/reservation/reservation_005_pos.ksh \ functional/reservation/reservation_006_pos.ksh \ functional/reservation/reservation_007_pos.ksh \ functional/reservation/reservation_008_pos.ksh \ functional/reservation/reservation_009_pos.ksh \ functional/reservation/reservation_010_pos.ksh \ functional/reservation/reservation_011_pos.ksh \ functional/reservation/reservation_012_pos.ksh \ functional/reservation/reservation_013_pos.ksh \ functional/reservation/reservation_014_pos.ksh \ functional/reservation/reservation_015_pos.ksh \ functional/reservation/reservation_016_pos.ksh \ functional/reservation/reservation_017_pos.ksh \ functional/reservation/reservation_018_pos.ksh \ functional/reservation/reservation_019_pos.ksh \ functional/reservation/reservation_020_pos.ksh \ functional/reservation/reservation_021_neg.ksh \ functional/reservation/reservation_022_pos.ksh \ functional/reservation/setup.ksh \ functional/rootpool/cleanup.ksh \ functional/rootpool/rootpool_002_neg.ksh \ functional/rootpool/rootpool_003_neg.ksh \ functional/rootpool/rootpool_007_pos.ksh \ functional/rootpool/setup.ksh \ functional/rsend/cleanup.ksh \ functional/rsend/recv_dedup_encrypted_zvol.ksh \ functional/rsend/recv_dedup.ksh \ functional/rsend/rsend_001_pos.ksh \ functional/rsend/rsend_002_pos.ksh \ functional/rsend/rsend_003_pos.ksh \ functional/rsend/rsend_004_pos.ksh \ functional/rsend/rsend_005_pos.ksh \ functional/rsend/rsend_006_pos.ksh \ functional/rsend/rsend_007_pos.ksh \ functional/rsend/rsend_008_pos.ksh \ functional/rsend/rsend_009_pos.ksh \ functional/rsend/rsend_010_pos.ksh \ functional/rsend/rsend_011_pos.ksh \ functional/rsend/rsend_012_pos.ksh \ functional/rsend/rsend_013_pos.ksh \ functional/rsend/rsend_014_pos.ksh \ functional/rsend/rsend_016_neg.ksh \ functional/rsend/rsend_019_pos.ksh \ functional/rsend/rsend_020_pos.ksh \ functional/rsend/rsend_021_pos.ksh \ functional/rsend/rsend_022_pos.ksh \ functional/rsend/rsend_024_pos.ksh \ functional/rsend/rsend_025_pos.ksh \ functional/rsend/rsend_026_neg.ksh \ functional/rsend/rsend_027_pos.ksh \ functional/rsend/rsend_028_neg.ksh \ functional/rsend/rsend_029_neg.ksh \ functional/rsend/rsend_030_pos.ksh \ functional/rsend/rsend_031_pos.ksh \ functional/rsend/send-c_embedded_blocks.ksh \ functional/rsend/send-c_incremental.ksh \ functional/rsend/send-c_lz4_disabled.ksh \ functional/rsend/send-c_mixed_compression.ksh \ functional/rsend/send-c_props.ksh \ functional/rsend/send-c_recv_dedup.ksh \ functional/rsend/send-c_recv_lz4_disabled.ksh \ functional/rsend/send-c_resume.ksh \ functional/rsend/send-c_stream_size_estimate.ksh \ functional/rsend/send-c_verify_contents.ksh \ functional/rsend/send-c_verify_ratio.ksh \ functional/rsend/send-c_volume.ksh \ functional/rsend/send-c_zstream_recompress.ksh \ functional/rsend/send-c_zstreamdump.ksh \ functional/rsend/send-cpL_varied_recsize.ksh \ functional/rsend/send_doall.ksh \ functional/rsend/send_encrypted_incremental.ksh \ functional/rsend/send_encrypted_files.ksh \ functional/rsend/send_encrypted_freeobjects.ksh \ functional/rsend/send_encrypted_hierarchy.ksh \ functional/rsend/send_encrypted_props.ksh \ functional/rsend/send_encrypted_truncated_files.ksh \ functional/rsend/send_freeobjects.ksh \ functional/rsend/send_holds.ksh \ functional/rsend/send_hole_birth.ksh \ functional/rsend/send_invalid.ksh \ functional/rsend/send-L_toggle.ksh \ functional/rsend/send_mixed_raw.ksh \ functional/rsend/send_partial_dataset.ksh \ functional/rsend/send_raw_ashift.ksh \ functional/rsend/send_raw_spill_block.ksh \ functional/rsend/send_raw_large_blocks.ksh \ functional/rsend/send_realloc_dnode_size.ksh \ functional/rsend/send_realloc_encrypted_files.ksh \ functional/rsend/send_realloc_files.ksh \ functional/rsend/send_spill_block.ksh \ functional/rsend/send-wR_encrypted_zvol.ksh \ functional/rsend/setup.ksh \ functional/scrub_mirror/cleanup.ksh \ functional/scrub_mirror/scrub_mirror_001_pos.ksh \ functional/scrub_mirror/scrub_mirror_002_pos.ksh \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ functional/slog/slog_002_pos.ksh \ functional/slog/slog_003_pos.ksh \ functional/slog/slog_004_pos.ksh \ functional/slog/slog_005_pos.ksh \ functional/slog/slog_006_pos.ksh \ functional/slog/slog_007_pos.ksh \ functional/slog/slog_008_neg.ksh \ functional/slog/slog_009_neg.ksh \ functional/slog/slog_010_neg.ksh \ functional/slog/slog_011_neg.ksh \ functional/slog/slog_012_neg.ksh \ functional/slog/slog_013_pos.ksh \ functional/slog/slog_014_pos.ksh \ functional/slog/slog_015_neg.ksh \ functional/slog/slog_016_pos.ksh \ functional/slog/slog_replay_fs_001.ksh \ functional/slog/slog_replay_fs_002.ksh \ functional/slog/slog_replay_volume.ksh \ functional/snapshot/cleanup.ksh \ functional/snapshot/clone_001_pos.ksh \ functional/snapshot/rollback_001_pos.ksh \ functional/snapshot/rollback_002_pos.ksh \ functional/snapshot/rollback_003_pos.ksh \ functional/snapshot/setup.ksh \ functional/snapshot/snapshot_001_pos.ksh \ functional/snapshot/snapshot_002_pos.ksh \ functional/snapshot/snapshot_003_pos.ksh \ functional/snapshot/snapshot_004_pos.ksh \ functional/snapshot/snapshot_005_pos.ksh \ functional/snapshot/snapshot_006_pos.ksh \ functional/snapshot/snapshot_007_pos.ksh \ functional/snapshot/snapshot_008_pos.ksh \ functional/snapshot/snapshot_009_pos.ksh \ functional/snapshot/snapshot_010_pos.ksh \ functional/snapshot/snapshot_011_pos.ksh \ functional/snapshot/snapshot_012_pos.ksh \ functional/snapshot/snapshot_013_pos.ksh \ functional/snapshot/snapshot_014_pos.ksh \ functional/snapshot/snapshot_015_pos.ksh \ functional/snapshot/snapshot_016_pos.ksh \ functional/snapshot/snapshot_017_pos.ksh \ functional/snapshot/snapshot_018_pos.ksh \ functional/snapused/cleanup.ksh \ functional/snapused/setup.ksh \ functional/snapused/snapused_001_pos.ksh \ functional/snapused/snapused_002_pos.ksh \ functional/snapused/snapused_003_pos.ksh \ functional/snapused/snapused_004_pos.ksh \ functional/snapused/snapused_005_pos.ksh \ functional/sparse/cleanup.ksh \ functional/sparse/setup.ksh \ functional/sparse/sparse_001_pos.ksh \ functional/stat/cleanup.ksh \ functional/stat/setup.ksh \ functional/stat/stat_001_pos.ksh \ functional/suid/cleanup.ksh \ functional/suid/setup.ksh \ functional/suid/suid_write_to_none.ksh \ functional/suid/suid_write_to_sgid.ksh \ functional/suid/suid_write_to_suid.ksh \ functional/suid/suid_write_to_suid_sgid.ksh \ functional/suid/suid_write_zil_replay.ksh \ functional/trim/autotrim_config.ksh \ functional/trim/autotrim_integrity.ksh \ functional/trim/autotrim_trim_integrity.ksh \ functional/trim/cleanup.ksh \ functional/trim/setup.ksh \ functional/trim/trim_config.ksh \ functional/trim/trim_integrity.ksh \ functional/trim/trim_l2arc.ksh \ functional/truncate/cleanup.ksh \ functional/truncate/setup.ksh \ functional/truncate/truncate_001_pos.ksh \ functional/truncate/truncate_002_pos.ksh \ functional/truncate/truncate_timestamps.ksh \ functional/upgrade/cleanup.ksh \ functional/upgrade/setup.ksh \ functional/upgrade/upgrade_projectquota_001_pos.ksh \ functional/upgrade/upgrade_readonly_pool.ksh \ functional/upgrade/upgrade_userobj_001_pos.ksh \ functional/user_namespace/cleanup.ksh \ functional/user_namespace/setup.ksh \ functional/user_namespace/user_namespace_001.ksh \ functional/user_namespace/user_namespace_002.ksh \ functional/user_namespace/user_namespace_003.ksh \ functional/user_namespace/user_namespace_004.ksh \ functional/userquota/cleanup.ksh \ functional/userquota/groupspace_001_pos.ksh \ functional/userquota/groupspace_002_pos.ksh \ functional/userquota/groupspace_003_pos.ksh \ functional/userquota/setup.ksh \ functional/userquota/userquota_001_pos.ksh \ functional/userquota/userquota_002_pos.ksh \ functional/userquota/userquota_003_pos.ksh \ functional/userquota/userquota_004_pos.ksh \ functional/userquota/userquota_005_neg.ksh \ functional/userquota/userquota_006_pos.ksh \ functional/userquota/userquota_007_pos.ksh \ functional/userquota/userquota_008_pos.ksh \ functional/userquota/userquota_009_pos.ksh \ functional/userquota/userquota_010_pos.ksh \ functional/userquota/userquota_011_pos.ksh \ functional/userquota/userquota_012_neg.ksh \ functional/userquota/userquota_013_pos.ksh \ functional/userquota/userspace_001_pos.ksh \ functional/userquota/userspace_002_pos.ksh \ functional/userquota/userspace_003_pos.ksh \ functional/userquota/userspace_encrypted.ksh \ functional/userquota/userspace_send_encrypted.ksh \ functional/userquota/userspace_encrypted_13709.ksh \ functional/vdev_zaps/cleanup.ksh \ functional/vdev_zaps/setup.ksh \ functional/vdev_zaps/vdev_zaps_001_pos.ksh \ functional/vdev_zaps/vdev_zaps_002_pos.ksh \ functional/vdev_zaps/vdev_zaps_003_pos.ksh \ functional/vdev_zaps/vdev_zaps_004_pos.ksh \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ functional/write_dirs/write_dirs_002_pos.ksh \ functional/xattr/cleanup.ksh \ functional/xattr/setup.ksh \ functional/xattr/xattr_001_pos.ksh \ functional/xattr/xattr_002_neg.ksh \ functional/xattr/xattr_003_neg.ksh \ functional/xattr/xattr_004_pos.ksh \ functional/xattr/xattr_005_pos.ksh \ functional/xattr/xattr_006_pos.ksh \ functional/xattr/xattr_007_neg.ksh \ functional/xattr/xattr_008_pos.ksh \ functional/xattr/xattr_009_neg.ksh \ functional/xattr/xattr_010_neg.ksh \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zvol/zvol_cli/cleanup.ksh \ functional/zvol/zvol_cli/setup.ksh \ functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \ functional/zvol/zvol_ENOSPC/cleanup.ksh \ functional/zvol/zvol_ENOSPC/setup.ksh \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \ functional/zvol/zvol_misc/cleanup.ksh \ functional/zvol/zvol_misc/setup.ksh \ functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \ functional/idmap_mount/cleanup.ksh \ functional/idmap_mount/setup.ksh \ functional/idmap_mount/idmap_mount_001.ksh \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ functional/idmap_mount/idmap_mount_005.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh index 7ac13adb6325..b985445a5d12 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/cleanup.ksh @@ -1,34 +1,38 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2023, Klara Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib verify_runnable "global" default_cleanup_noexit +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 512f5a0644df..58441bf8f3ad 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -1,36 +1,41 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2023, Klara Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib if ! command -v clonefile > /dev/null ; then log_unsupported "clonefile program required to test block cloning" fi verify_runnable "global" +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh index c35ca8e8c92c..c7c133a219cd 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh @@ -1,54 +1,58 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2021 Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib # # DESCRIPTION: # Verify pools can be created with the expected feature set enabled. # # STRATEGY: # 1. Create a pool with a known feature set. # 2. Verify only those features are active/enabled. +# 3. Do this for all known feature sets # verify_runnable "global" function cleanup { datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL } log_onexit cleanup log_assert "creates a pool with a specified feature set enabled" -log_must zpool create -f -o compatibility=compat-2020 $TESTPOOL $DISKS -check_feature_set $TESTPOOL compat-2020 -log_must zpool destroy -f $TESTPOOL +for compat in "$ZPOOL_COMPAT_DIR"/* +do + log_must zpool create -f -o compatibility="${compat##*/}" $TESTPOOL $DISKS + check_feature_set $TESTPOOL "${compat##*/}" + log_must zpool destroy -f $TESTPOOL +done log_pass "creates a pool with a specified feature set enabled" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg index e98b5e8b2214..9c76a8780b4a 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg @@ -1,123 +1,123 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2013 by Delphix. All rights reserved. # if is_linux; then - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ acltype atime \ checksum compression devices \ exec mountpoint quota readonly \ recordsize reservation setuid \ snapdir" # these are a set of values we apply, for use when testing the # zfs get/set subcommands - ordered as per the list above so we # can iterate over both sets in an array PROP_VALS="\ posix on \ fletcher2 on on \ on legacy none on \ 128K none on \ visible" # these are an alternate set of property values PROP_ALTVALS="\ nfsv4 off \ fletcher4 lzjb off \ off /tmp/zfstest 100M off \ 512 10m off \ hidden" elif is_freebsd; then PROP_NAMES="\ acltype atime \ checksum compression devices \ exec mountpoint quota readonly \ recordsize reservation setuid \ snapdir" # these are a set of values we apply, for use when testing the # zfs get/set subcommands - ordered as per the list above so we # can iterate over both sets in an array PROP_VALS="\ posix on \ fletcher2 on on \ on legacy none on \ 128K none on \ visible" # these are an alternate set of property values PROP_ALTVALS="\ nfsv4 off \ fletcher4 lzjb off \ off /tmp/zfstest 100M off \ 512 10m off \ hidden" else - # these are the set of setable ZFS properties + # these are the set of settable ZFS properties PROP_NAMES="\ aclinherit aclmode atime \ checksum compression devices \ exec mountpoint quota readonly \ recordsize reservation setuid sharenfs \ snapdir" # these are a set of values we apply, for use when testing the # zfs get/set subcommands - ordered as per the list above so we # can iterate over both sets in an array PROP_VALS="\ passthrough discard on \ fletcher2 on on \ on legacy none on \ 128K none on on \ visible" # these are an alternate set of property values PROP_ALTVALS="\ passthrough-x groupmask off \ fletcher4 lzjb off \ off /tmp/zfstest 100M off \ 512 10m off off \ hidden" fi # additional properties to worry about: canmount copies xattr zoned version POOL_PROPS="\ failmode autoreplace" POOL_VALS="\ continue on" POOL_ALTVALS="\ panic off" export TESTSNAP=testsnap-misc export TESTCLCT=testclct-misc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 081e6c18430d..ae56ee9919bf 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -1,109 +1,140 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2017 by Intel Corporation. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/fault/fault.cfg # # DESCRIPTION: # Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test. # # STRATEGY: # 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. # This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". # 2. Create a pool and set autoreplace=on (auto-replace is opt-in) -# 3. Export a pool +# 3. Export the pool # 4. Wipe and offline the scsi_debug disk -# 5. Import pool with missing disk +# 5. Import the pool with missing disk # 6. Re-online the wiped scsi_debug disk -# 7. Verify the ZED detects the new unused disk and adds it back to the pool +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned # -# Creates a raidz1 zpool using persistent disk path names +# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names # (ie not /dev/sdc) # # Auto-replace is opt in, and matches by phys_path. # verify_runnable "both" if ! is_physical_device $DISKS; then log_unsupported "Unsupported disks for this test." fi function cleanup { zpool status $TESTPOOL destroy_pool $TESTPOOL sed -i '/alias scsidebug/d' $VDEVID_CONF unload_scsi_debug } log_assert "Testing automated auto-replace FMA test" log_onexit cleanup load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b' SD=$(get_debug_device) SD_DEVICE_ID=$(get_persistent_disk_name $SD) SD_HOST=$(get_scsi_host $SD) # Register vdev_id alias for scsi_debug device to create a persistent path echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF block_device_wait SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \ awk -F'=' '/ID_VDEV=/ {print $2; exit}') [ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly" log_must zpool events -c log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3 # Auto-replace is opt-in so need to set property log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool -log_must mkfile $FSIZE /$TESTPOOL/data +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z log_must zpool export $TESTPOOL +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# # Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID remove_disk $SD block_device_wait # Re-import pool with drive missing log_must zpool import $TESTPOOL log_must check_state $TESTPOOL "" "DEGRADED" block_device_wait # Online an empty disk in the same physical location insert_disk $SD $SD_HOST # Wait for the new disk to be online and replaced log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" 60 log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + log_pass "Auto-replace test successful" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh new file mode 100755 index 000000000000..2259e604317b --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh @@ -0,0 +1,192 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2017 by Intel Corporation. All rights reserved. +# Copyright (c) 2023 by Klara, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test. +# Verifys that auto-replace works with by-id paths. +# +# STRATEGY: +# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path. +# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug". +# 2. Create a pool and set autoreplace=on (auto-replace is opt-in) +# 3. Export the pool +# 4. Wipe and offline the scsi_debug disk +# 5. Import the pool with missing disk +# 6. Re-online the wiped scsi_debug disk with a new serial number +# 7. Verify ZED detects the new blank disk and replaces the missing vdev +# 8. Verify that the scsi_debug disk was re-partitioned +# +# Creates a raidz1 zpool using persistent /dev/disk/by-id path names +# +# Auto-replace is opt in, and matches by phys_path. +# + +verify_runnable "both" + +if ! is_physical_device $DISKS; then + log_unsupported "Unsupported disks for this test." +fi + +function cleanup +{ + zpool status $TESTPOOL + destroy_pool $TESTPOOL + sed -i '/alias scsidebug/d' $VDEVID_CONF + unload_scsi_debug +} + +# +# Wait until a vdev transitions to its replacement vdev +# +# Return 0 when vdev reaches expected state, 1 on timeout. +# +# Note: index +2 is to skip over root and raidz-0 vdevs +# +function wait_vdev_online # pool index oldguid timeout +{ + typeset pool=$1 + typeset -i index=$2+2 + typeset guid=$3 + typeset timeout=${4:-60} + typeset -i i=0 + + while [[ $i -lt $timeout ]]; do + vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) ) + + if [ "${vdev_guids[$index]}" != "${guid}" ]; then + log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}" + return 0 + fi + + i=$((i+1)) + sleep 1 + done + + return 1 +} +log_assert "automated auto-replace with by-id paths" +log_onexit cleanup + +load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b' +SD=$(get_debug_device) +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +SD_HOST=$(get_scsi_host $SD) + +# Register vdev_id alias for scsi_debug device to create a persistent path +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \ + awk -F'=' '/ID_VDEV=/ {print $2; exit}') +[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly" + +log_must zpool events -c +log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3 + +vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID) +log_note original vdev guid ${vdev_guid} + +# Auto-replace is opt-in so need to set property +log_must zpool set autoreplace=on $TESTPOOL + +# Add some data to the pool +log_must zfs create $TESTPOOL/fs +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must zpool export $TESTPOOL + +# Record the partition UUID for later comparison +part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) +[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid} + +# +# Wipe and offline the disk +# +# Note that it is not enough to zero the disk to expunge the partitions. +# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe'). +# +# Using partprobe is overkill and hdparm is not as common as wipefs. So +# we use wipefs which lets the kernel know the partition was removed +# from the device (i.e., calls BLKRRPART ioctl). +# +log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE +log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID +remove_disk $SD +block_device_wait + +# Re-import pool with drive missing +log_must zpool import $TESTPOOL +log_must check_state $TESTPOOL "" "DEGRADED" +block_device_wait + +# +# Online an empty disk in the same physical location, with a different by-id +# symlink. We use vpd_use_hostno to make sure the underlying serial number +# changes for the new disk which in turn gives us a different by-id path. +# +# The original names were something like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1 +# /dev/disk/by-id/wwn-0x33333330000007d0-part1 +# +# This new inserted disk, will have different links like: +# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1 +# /dev/disk/by-id/wwn-0x0x3333333000003e80 -part1 +# +echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno + +insert_disk $SD $SD_HOST + +# make sure the physical path points to the same scsi-debug device +SD_DEVICE_ID=$(get_persistent_disk_name $SD) +echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF +block_device_wait + +# Wait for the new disk to be online and replaced +log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45 +log_must wait_replacing $TESTPOOL 45 + +# Validate auto-replace was successful +log_must check_state $TESTPOOL "" "ONLINE" + +# +# Confirm the partition UUID changed so we know the new disk was relabeled +# +# Note: some older versions of udevadm don't support "--property" option so +# we'll # skip this test when it is not supported +# +if [ ! -z "$part_uuid" ]; then + new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \ + --value /dev/disk/by-id/$SD_DEVICE_ID) + log_note new disk GPT uuid ${new_uuid} + [[ "$part_uuid" = "$new_uuid" ]] && \ + log_fail "The new disk was not relabeled as expected" +fi + +log_pass "automated auto-replace with by-id paths" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/procfs/pool_state.ksh index 7a02eb68abda..bae876379177 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/procfs/pool_state.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/procfs/pool_state.ksh @@ -1,148 +1,152 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # # # DESCRIPTION: # Test /proc/spl/kstat/zfs//state kstat # # STRATEGY: # 1. Create a mirrored pool # 2. Check that pool is ONLINE # 3. Fault one disk # 4. Check that pool is DEGRADED # 5. Create a new pool with a single scsi_debug disk # 6. Remove the disk # 7. Check that pool is SUSPENDED # 8. Add the disk back in # 9. Clear errors and destroy the pools . $STF_SUITE/include/libtest.shlib verify_runnable "both" function cleanup { # Destroy the scsi_debug pool if [ -n "$TESTPOOL2" ] ; then if [ -n "$host" ] ; then # Re-enable the disk scan_scsi_hosts $host # Device may have changed names after being inserted SDISK=$(get_debug_device) log_must ln $DEV_RDSKDIR/$SDISK $REALDISK fi # Restore our working pool image if [ -n "$BACKUP" ] ; then gunzip -c $BACKUP > $REALDISK log_must rm -f $BACKUP fi if poolexists $TESTPOOL2 ; then # Our disk is back. Now we can clear errors and destroy the # pool cleanly. log_must zpool clear $TESTPOOL2 # Now that the disk is back and errors cleared, wait for our # hung 'zpool scrub' to finish. wait destroy_pool $TESTPOOL2 fi log_must rm -f $REALDISK unload_scsi_debug fi } # Check that our pool state values match what's expected # # $1: pool name # $2: expected state ("ONLINE", "DEGRADED", "SUSPENDED", etc) function check_all { pool=$1 expected=$2 state1=$(zpool status $pool | awk '/state: /{print $2}'); state2=$(zpool list -H -o health $pool) state3=$(/state kstat" # Test that the initial pool is healthy check_all $TESTPOOL "ONLINE" # Fault one of the disks, and check that pool is degraded read -r DISK1 _ <<<"$DISKS" log_must zpool offline -tf $TESTPOOL $DISK1 check_all $TESTPOOL "DEGRADED" log_must zpool online $TESTPOOL $DISK1 log_must zpool clear $TESTPOOL # Create a new pool out of a scsi_debug disk TESTPOOL2=testpool2 MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576)) load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b' SDISK=$(get_debug_device) host=$(get_scsi_host $SDISK) # Use $REALDISK instead of $SDISK in our pool because $SDISK can change names # as we remove/add the disk (i.e. /dev/sdf -> /dev/sdg). REALDISK=/dev/kstat-state-realdisk log_must [ ! -e $REALDISK ] ln $DEV_RDSKDIR/$SDISK $REALDISK log_must zpool create $TESTPOOL2 $REALDISK # Backup the contents of the disk image BACKUP=$TEST_BASE_DIR/kstat-state-realdisk.gz log_must [ ! -e $BACKUP ] gzip -c $REALDISK > $BACKUP # Yank out the disk from under the pool log_must rm $REALDISK remove_disk $SDISK # Run a 'zpool scrub' in the background to suspend the pool. We run it in the # background since the command will hang when the pool gets suspended. The # command will resume and exit after we restore the missing disk later on. zpool scrub $TESTPOOL2 & -sleep 3 # Give the scrub some time to run before we check if it fails +# Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds. +# Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s) +# it take to update the status. +# hence sleeping for 200 seconds so that we get the correct status. +sleep 200 # Give the scrub some time to run before we check if it fails log_must check_all $TESTPOOL2 "SUSPENDED" log_pass "/proc/spl/kstat/zfs//state test successful" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 9c423a3e3e36..02f5116f3eb3 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,1134 +1,1149 @@ /* */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* #undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() exists and takes 4 args */ /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_put() accepts void* as arg 2 */ /* #undef HAVE_BLKDEV_PUT_HOLDER */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* blk_mode_t is defined */ /* #undef HAVE_BLK_MODE_T */ /* block multiqueue is available */ /* #undef HAVE_BLK_MQ */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* BLK_STS_RESV_CONFLICT is defined */ /* #undef HAVE_BLK_STS_RESV_CONFLICT */ /* Define if release() in block_device_operations takes 1 arg */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_splice_read exists */ /* #undef HAVE_COPY_SPLICE_READ */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* dentry aliases are in d_u member */ /* #undef HAVE_DENTRY_D_U_ALIASES */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_check_media_change() exists */ /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* FALLOC_FL_ZERO_RANGE is defined */ /* #undef HAVE_FALLOC_FL_ZERO_RANGE */ /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* filemap_range_has_page() is available */ /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* fops->fadvise() exists */ /* #undef HAVE_FILE_FADVISE */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ +/* fsync_bdev() is declared in include/blkdev.h */ +/* #undef HAVE_FSYNC_BDEV */ + /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ +/* generic_fillattr requires struct mnt_idmap* and u32 request_mask */ +/* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */ + /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ +/* inode_get_ctime() exists in linux/fs.h */ +/* #undef HAVE_INODE_GET_CTIME */ + /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ +/* inode_set_ctime_to_ts() exists in linux/fs.h */ +/* #undef HAVE_INODE_SET_CTIME_TO_TS */ + /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->setattr() exists */ /* #undef HAVE_IOPS_SETATTR */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* iter_iov() is available */ /* #undef HAVE_ITER_IOV */ /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* linux/blk-cgroup.h exists */ /* #undef HAVE_LINUX_BLK_CGROUP_HEADER */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* register_sysctl_table exists */ /* #undef HAVE_REGISTER_SYSCTL_TABLE */ /* iops->rename2() exists */ /* #undef HAVE_RENAME2 */ /* struct inode_operations_wrapper takes .rename2() */ /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. */ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* cs->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ +/* sync_blockdev() is declared in include/blkdev.h */ +/* #undef HAVE_SYNC_BLOCKDEV */ + /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ /* user_namespace->ns.inum exists */ /* #undef HAVE_USER_NS_COMMON_INUM */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* fops->clone_file_range() is available */ /* #undef HAVE_VFS_CLONE_FILE_RANGE */ /* fops->copy_file_range() is available */ /* #undef HAVE_VFS_COPY_FILE_RANGE */ /* fops->dedupe_file_range() is available */ /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* file_operations_extend takes .copy_file_range() and .clone_file_range() */ /* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */ /* generic_copy_file_range() is available */ /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->remap_file_range() is available */ /* #undef HAVE_VFS_REMAP_FILE_RANGE */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES */ #define HAVE_XSAVES 1 /* ZERO_PAGE() is GPL-only */ /* #undef HAVE_ZERO_PAGE_GPL_ONLY */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* using complete_and_exit() instead */ /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* pde_data() is PDE_DATA() */ /* #undef SPL_PDE_DATA */ /* Define to 1 if all of the C90 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* Version number of package */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* GENHD_FL_EXT_DEVT flag is not available */ /* #undef ZFS_GENHD_FL_EXT_DEVT */ /* GENHD_FL_NO_PART_SCAN flag is available */ /* #undef ZFS_GENHD_FL_NO_PART */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.0-FreeBSD_g95785196f" +#define ZFS_META_ALIAS "zfs-2.2.1-FreeBSD_g55dd24c4c" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.5" +#define ZFS_META_KVER_MAX "6.6" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. */ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_g95785196f" +#define ZFS_META_RELEASE "FreeBSD_g55dd24c4c" /* Define the project version. */ -#define ZFS_META_VERSION "2.2.0" +#define ZFS_META_VERSION "2.2.1" /* count is located in percpu_ref.data */ /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */ diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index d1341d6a39ef..231b16fa8298 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.2.0-0-g95785196f" +#define ZFS_META_GITREV "zfs-2.2.1-0-g55dd24c4c"