diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 index d9ac03fa8ec2..bd49e1b53770 100755 --- a/cmd/arc_summary/arc_summary3 +++ b/cmd/arc_summary/arc_summary3 @@ -1,972 +1,971 @@ #!/usr/bin/env python3 # # Copyright (c) 2008 Ben Rockwood , # Copyright (c) 2010 Martin Matuska , # Copyright (c) 2010-2011 Jason J. Hellenthal , # Copyright (c) 2017 Scot W. Stevenson # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. """Print statistics on the ZFS ARC Cache and other information Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. The original introduction to arc_summary can be found at http://cuddletech.com/?p=454 """ import argparse import os import subprocess import sys import time DESCRIPTION = 'Print ARC and other statistics for OpenZFS' INDENT = ' '*8 LINE_LENGTH = 72 DATE_FORMAT = '%a %b %d %H:%M:%S %Y' TITLE = 'ZFS Subsystem Report' SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' # Tunables and SPL are handled separately because they come from # different sources SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats 'vdev': 'vdev_cache_stats', - 'xuio': 'xuio_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} parser = argparse.ArgumentParser(description=DESCRIPTION) parser.add_argument('-a', '--alternate', action='store_true', default=False, help='use alternate formatting for tunables and SPL', dest='alt') parser.add_argument('-d', '--description', action='store_true', default=False, help='print descriptions with tunables and SPL', dest='desc') parser.add_argument('-g', '--graph', action='store_true', default=False, help='print graph on ARC use and exit', dest='graph') parser.add_argument('-p', '--page', type=int, dest='page', help='print page by number (DEPRECATED, use "-s")') parser.add_argument('-r', '--raw', action='store_true', default=False, help='dump all available data with minimal formatting', dest='raw') parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) ARGS = parser.parse_args() if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl VDEV_CACHE_SIZE = 'vdev.cache_size' def load_kstats(section): base = 'kstat.zfs.misc.{section}.'.format(section=section) # base is removed from the name fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):], value=kstat.value) return [fmt(kstat) for kstat in sysctl.filter(base)] def get_params(base): cut = 8 # = len('vfs.zfs.') return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)} def get_tunable_params(): return get_params('vfs.zfs') def get_vdev_params(): return get_params('vfs.zfs.vdev') def get_version_impl(request): # FreeBSD reports versions for zpl and spa instead of zfs and spl. name = {'zfs': 'zpl', 'spl': 'spa'}[request] mib = 'vfs.zfs.version.{}'.format(name) version = sysctl.filter(mib)[0].value return '{} version {}'.format(name, version) def get_descriptions(_request): # py-sysctl doesn't give descriptions, so we have to shell out. command = ['sysctl', '-d', 'vfs.zfs'] # The recommended way to do this is with subprocess.run(). However, # some installed versions of Python are < 3.5, so we offer them # the option of doing it the old way (for now) if 'run' in dir(subprocess): info = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True) lines = info.stdout.split('\n') else: info = subprocess.check_output(command, universal_newlines=True) lines = info.split('\n') def fmt(line): name, desc = line.split(':', 1) return (name.strip(), desc.strip()) return dict([fmt(line) for line in lines if len(line) > 0]) elif sys.platform.startswith('linux'): KSTAT_PATH = '/proc/spl/kstat/zfs' SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: return list(f)[2:] # Get rid of header def get_params(basepath): """Collect information on the Solaris Porting Layer (SPL) or the tunables, depending on the PATH given. Does not check if PATH is legal. """ result = {} for name in os.listdir(basepath): path = os.path.join(basepath, name) with open(path) as f: value = f.read() result[name] = value.strip() return result def get_spl_params(): return get_params(SPL_PATH) def get_tunable_params(): return get_params(TUNABLES_PATH) def get_vdev_params(): return get_params(TUNABLES_PATH) def get_version_impl(request): # The original arc_summary called /sbin/modinfo/{spl,zfs} to get # the version information. We switch to /sys/module/{spl,zfs}/version # to make sure we get what is really loaded in the kernel command = ["cat", "/sys/module/{0}/version".format(request)] req = request.upper() # The recommended way to do this is with subprocess.run(). However, # some installed versions of Python are < 3.5, so we offer them # the option of doing it the old way (for now) if 'run' in dir(subprocess): info = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True) version = info.stdout.strip() else: info = subprocess.check_output(command, universal_newlines=True) version = info.strip() return version def get_descriptions(request): """Get the descriptions of the Solaris Porting Layer (SPL) or the tunables, return with minimal formatting. """ if request not in ('spl', 'zfs'): print('ERROR: description of "{0}" requested)'.format(request)) sys.exit(1) descs = {} target_prefix = 'parm:' # We would prefer to do this with /sys/modules -- see the discussion at # get_version() -- but there isn't a way to get the descriptions from # there, so we fall back on modinfo command = ["/sbin/modinfo", request, "-0"] # The recommended way to do this is with subprocess.run(). However, # some installed versions of Python are < 3.5, so we offer them # the option of doing it the old way (for now) info = '' try: if 'run' in dir(subprocess): info = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True) raw_output = info.stdout.split('\0') else: info = subprocess.check_output(command, universal_newlines=True) raw_output = info.split('\0') except subprocess.CalledProcessError: print("Error: Descriptions not available", "(can't access kernel module)") sys.exit(1) for line in raw_output: if not line.startswith(target_prefix): continue line = line[len(target_prefix):].strip() name, raw_desc = line.split(':', 1) desc = raw_desc.rsplit('(', 1)[0] if desc == '': desc = '(No description found)' descs[name.strip()] = desc.strip() return descs def cleanup_line(single_line): """Format a raw line of data from /proc and isolate the name value part, returning a tuple with each. Currently, this gets rid of the middle '4'. For example "arc_no_grow 4 0" returns the tuple ("arc_no_grow", "0"). """ name, _, value = single_line.split() return name, value def draw_graph(kstats_dict): """Draw a primitive graph representing the basic information on the ARC -- its size and the proportion used by MFU and MRU -- and quit. We use max size of the ARC to calculate how full it is. This is a very rough representation. """ arc_stats = isolate_section('arcstats', kstats_dict) GRAPH_INDENT = ' '*4 GRAPH_WIDTH = 60 arc_size = f_bytes(arc_stats['size']) arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) mfu_size = f_bytes(arc_stats['mfu_size']) mru_size = f_bytes(arc_stats['mru_size']) meta_limit = f_bytes(arc_stats['arc_meta_limit']) meta_size = f_bytes(arc_stats['arc_meta_used']) dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) dnode_size = f_bytes(arc_stats['dnode_size']) info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' 'DNODE {6} ({7})') info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, meta_size, meta_limit, dnode_size, dnode_limit) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+' mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max'])) mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max'])) arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max'])) total_ticks = float(arc_perc)*GRAPH_WIDTH mfu_ticks = mfu_perc*GRAPH_WIDTH mru_ticks = mru_perc*GRAPH_WIDTH other_ticks = total_ticks-(mfu_ticks+mru_ticks) core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks) core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form))) core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|' for line in ('', info_line, graph_line, core_line, graph_line, ''): print(line) def f_bytes(byte_string): """Return human-readable representation of a byte value in powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal points. Values smaller than one KiB are returned without decimal points. Note "bytes" is a reserved keyword. """ prefixes = ([2**80, "YiB"], # yobibytes (yotta) [2**70, "ZiB"], # zebibytes (zetta) [2**60, "EiB"], # exbibytes (exa) [2**50, "PiB"], # pebibytes (peta) [2**40, "TiB"], # tebibytes (tera) [2**30, "GiB"], # gibibytes (giga) [2**20, "MiB"], # mebibytes (mega) [2**10, "KiB"]) # kibibytes (kilo) bites = int(byte_string) if bites >= 2**10: for limit, unit in prefixes: if bites >= limit: value = bites / limit break result = '{0:.1f} {1}'.format(value, unit) else: result = '{0} Bytes'.format(bites) return result def f_hits(hits_string): """Create a human-readable representation of the number of hits. The single-letter symbols used are SI to avoid the confusion caused by the different "short scale" and "long scale" representations in English, which use the same words for different values. See https://en.wikipedia.org/wiki/Names_of_large_numbers and: https://physics.nist.gov/cuu/Units/prefixes.html """ numbers = ([10**24, 'Y'], # yotta (septillion) [10**21, 'Z'], # zetta (sextillion) [10**18, 'E'], # exa (quintrillion) [10**15, 'P'], # peta (quadrillion) [10**12, 'T'], # tera (trillion) [10**9, 'G'], # giga (billion) [10**6, 'M'], # mega (million) [10**3, 'k']) # kilo (thousand) hits = int(hits_string) if hits >= 1000: for limit, symbol in numbers: if hits >= limit: value = hits/limit break result = "%0.1f%s" % (value, symbol) else: result = "%d" % hits return result def f_perc(value1, value2): """Calculate percentage and return in human-readable form. If rounding produces the result '0.0' though the first number is not zero, include a 'less-than' symbol to avoid confusion. Division by zero is handled by returning 'n/a'; no error is called. """ v1 = float(value1) v2 = float(value2) try: perc = 100 * v1/v2 except ZeroDivisionError: result = 'n/a' else: result = '{0:0.1f} %'.format(perc) if result == '0.0 %' and v1 > 0: result = '< 0.1 %' return result def format_raw_line(name, value): """For the --raw option for the tunable and SPL outputs, decide on the correct formatting based on the --alternate flag. """ if ARGS.alt: result = '{0}{1}={2}'.format(INDENT, name, value) else: spc = LINE_LENGTH-(len(INDENT)+len(value)) result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc) return result def get_kstats(): """Collect information on the ZFS subsystem. The step does not perform any further processing, giving us the option to only work on what is actually needed. The name "kstat" is a holdover from the Solaris utility of the same name. """ result = {} for section in SECTION_PATHS.values(): if section not in result: result[section] = load_kstats(section) return result def get_version(request): """Get the version number of ZFS or SPL on this machine for header. Returns an error string, but does not raise an error, if we can't get the ZFS/SPL version. """ if request not in ('spl', 'zfs'): error_msg = '(ERROR: "{0}" requested)'.format(request) return error_msg return get_version_impl(request) def print_header(): """Print the initial heading with date and time as well as info on the kernel and ZFS versions. This is not called for the graph. """ # datetime is now recommended over time but we keep the exact formatting # from the older version of arc_summary in case there are scripts # that expect it in this way daydate = time.strftime(DATE_FORMAT) spc_date = LINE_LENGTH-len(daydate) sys_version = os.uname() sys_msg = sys_version.sysname+' '+sys_version.release zfs = get_version('zfs') spc_zfs = LINE_LENGTH-len(zfs) machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')' spl = get_version('spl') spc_spl = LINE_LENGTH-len(spl) print('\n'+('-'*LINE_LENGTH)) print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date)) print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs)) print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl)) def print_raw(kstats_dict): """Print all available data from the system in a minimally sorted format. This can be used as a source to be piped through 'grep'. """ sections = sorted(kstats_dict.keys()) for section in sections: print('\n{0}:'.format(section.upper())) lines = sorted(kstats_dict[section]) for line in lines: name, value = cleanup_line(line) print(format_raw_line(name, value)) # Tunables and SPL must be handled separately because they come from a # different source and have descriptions the user might request print() section_spl() section_tunables() def isolate_section(section_name, kstats_dict): """From the complete information on all sections, retrieve only those for one section. """ try: section_data = kstats_dict[section_name] except KeyError: print('ERROR: Data on {0} not available'.format(section_data)) sys.exit(1) section_dict = dict(cleanup_line(l) for l in section_data) return section_dict # Formatted output helper functions def prt_1(text, value): """Print text and one value, no indent""" spc = ' '*(LINE_LENGTH-(len(text)+len(value))) print('{0}{spc}{1}'.format(text, value, spc=spc)) def prt_i1(text, value): """Print text and one value, with indent""" spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value))) print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc)) def prt_2(text, value1, value2): """Print text and two values, no indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2)) print('{0}{spc} {1}'.format(text, values, spc=spc)) def prt_i2(text, value1, value2): """Print text and two values, with indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2)) print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc)) # The section output concentrates on important parameters instead of # being exhaustive (that is what the --raw parameter is for) def section_arc(kstats_dict): """Give basic information on the ARC, MRU and MFU. This is the first and most used section. """ arc_stats = isolate_section('arcstats', kstats_dict) throttle = arc_stats['memory_throttle_count'] if throttle == '0': health = 'HEALTHY' else: health = 'THROTTLED' prt_1('ARC status:', health) prt_i1('Memory throttle count:', throttle) print() arc_size = arc_stats['size'] arc_target_size = arc_stats['c'] arc_max = arc_stats['c_max'] arc_min = arc_stats['c_min'] mfu_size = arc_stats['mfu_size'] mru_size = arc_stats['mru_size'] meta_limit = arc_stats['arc_meta_limit'] meta_size = arc_stats['arc_meta_used'] dnode_limit = arc_stats['arc_dnode_limit'] dnode_size = arc_stats['dnode_size'] target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) prt_2('ARC size (current):', f_perc(arc_size, arc_max), f_bytes(arc_size)) prt_i2('Target size (adaptive):', f_perc(arc_target_size, arc_max), f_bytes(arc_target_size)) prt_i2('Min size (hard limit):', f_perc(arc_min, arc_max), f_bytes(arc_min)) prt_i2('Max size (high water):', target_size_ratio, f_bytes(arc_max)) caches_size = int(mfu_size)+int(mru_size) prt_i2('Most Frequently Used (MFU) cache size:', f_perc(mfu_size, caches_size), f_bytes(mfu_size)) prt_i2('Most Recently Used (MRU) cache size:', f_perc(mru_size, caches_size), f_bytes(mru_size)) prt_i2('Metadata cache size (hard limit):', f_perc(meta_limit, arc_max), f_bytes(meta_limit)) prt_i2('Metadata cache size (current):', f_perc(meta_size, meta_limit), f_bytes(meta_size)) prt_i2('Dnode cache size (hard limit):', f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) prt_i2('Dnode cache size (current):', f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) print() print('ARC hash breakdown:') prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) prt_i2('Elements current:', f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), f_hits(arc_stats['hash_elements'])) prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) print() print('ARC misc:') prt_i1('Deleted:', f_hits(arc_stats['deleted'])) prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) prt_i1('Eviction skips due to L2 writes:', f_hits(arc_stats['evict_l2_skip'])) prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) prt_i2('L2 eligible MFU evictions:', f_perc(arc_stats['evict_l2_eligible_mfu'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mfu'])) prt_i2('L2 eligible MRU evictions:', f_perc(arc_stats['evict_l2_eligible_mru'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mru'])) prt_i1('L2 ineligible evictions:', f_bytes(arc_stats['evict_l2_ineligible'])) print() def section_archits(kstats_dict): """Print information on how the caches are accessed ("arc hits"). """ arc_stats = isolate_section('arcstats', kstats_dict) all_accesses = int(arc_stats['hits'])+int(arc_stats['misses']) actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits']) prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses)) ta_todo = (('Cache hit ratio:', arc_stats['hits']), ('Cache miss ratio:', arc_stats['misses']), ('Actual hit ratio (MFU + MRU hits):', actual_hits)) for title, value in ta_todo: prt_i2(title, f_perc(value, all_accesses), f_hits(value)) dd_total = int(arc_stats['demand_data_hits']) +\ int(arc_stats['demand_data_misses']) prt_i2('Data demand efficiency:', f_perc(arc_stats['demand_data_hits'], dd_total), f_hits(dd_total)) dp_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_misses']) prt_i2('Data prefetch efficiency:', f_perc(arc_stats['prefetch_data_hits'], dp_total), f_hits(dp_total)) known_hits = int(arc_stats['mfu_hits']) +\ int(arc_stats['mru_hits']) +\ int(arc_stats['mfu_ghost_hits']) +\ int(arc_stats['mru_ghost_hits']) anon_hits = int(arc_stats['hits'])-known_hits print() print('Cache hits by cache type:') cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), ('Most recently used (MRU):', arc_stats['mru_hits']), ('Most frequently used (MFU) ghost:', arc_stats['mfu_ghost_hits']), ('Most recently used (MRU) ghost:', arc_stats['mru_ghost_hits'])) for title, value in cl_todo: prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) # For some reason, anon_hits can turn negative, which is weird. Until we # have figured out why this happens, we just hide the problem, following # the behavior of the original arc_summary. if anon_hits >= 0: prt_i2('Anonymously used:', f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits)) print() print('Cache hits by data type:') dt_todo = (('Demand data:', arc_stats['demand_data_hits']), ('Demand prefetch data:', arc_stats['prefetch_data_hits']), ('Demand metadata:', arc_stats['demand_metadata_hits']), ('Demand prefetch metadata:', arc_stats['prefetch_metadata_hits'])) for title, value in dt_todo: prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) print() print('Cache misses by data type:') dm_todo = (('Demand data:', arc_stats['demand_data_misses']), ('Demand prefetch data:', arc_stats['prefetch_data_misses']), ('Demand metadata:', arc_stats['demand_metadata_misses']), ('Demand prefetch metadata:', arc_stats['prefetch_metadata_misses'])) for title, value in dm_todo: prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value)) print() def section_dmu(kstats_dict): """Collect information on the DMU""" zfetch_stats = isolate_section('zfetchstats', kstats_dict) zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total)) prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) print() def section_l2arc(kstats_dict): """Collect information on L2ARC device if present. If not, tell user that we're skipping the section. """ # The L2ARC statistics live in the same section as the normal ARC stuff arc_stats = isolate_section('arcstats', kstats_dict) if arc_stats['l2_size'] == '0': print('L2ARC not detected, skipping section\n') return l2_errors = int(arc_stats['l2_writes_error']) +\ int(arc_stats['l2_cksum_bad']) +\ int(arc_stats['l2_io_error']) l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) health = 'HEALTHY' if l2_errors > 0: health = 'DEGRADED' prt_1('L2ARC status:', health) l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), ('I/O errors:', 'l2_io_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) print() prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), f_bytes(arc_stats['l2_asize'])) prt_i2('Header size:', f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) prt_i2('MFU allocated size:', f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mfu_asize'])) prt_i2('MRU allocated size:', f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mru_asize'])) prt_i2('Prefetch allocated size:', f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_prefetch_asize'])) prt_i2('Data (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_data_asize'])) prt_i2('Metadata (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_metadata_asize'])) print() prt_1('L2ARC breakdown:', f_hits(l2_access_total)) prt_i2('Hit ratio:', f_perc(arc_stats['l2_hits'], l2_access_total), f_hits(arc_stats['l2_hits'])) prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() print('L2ARC writes:') if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) prt_i2('Done ratio:', f_perc(arc_stats['l2_writes_done'], arc_stats['l2_writes_sent']), f_hits(arc_stats['l2_writes_done'])) prt_i2('Error ratio:', f_perc(arc_stats['l2_writes_error'], arc_stats['l2_writes_sent']), f_hits(arc_stats['l2_writes_error'])) else: prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) print() def section_spl(*_): """Print the SPL parameters, if requested with alternative format and/or descriptions. This does not use kstats. """ if sys.platform.startswith('freebsd'): # No SPL support in FreeBSD return spls = get_spl_params() keylist = sorted(spls.keys()) print('Solaris Porting Layer (SPL):') if ARGS.desc: descriptions = get_descriptions('spl') for key in keylist: value = spls[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_tunables(*_): """Print the tunables, if requested with alternative format and/or descriptions. This does not use kstasts. """ tunables = get_tunable_params() keylist = sorted(tunables.keys()) print('Tunables:') if ARGS.desc: descriptions = get_descriptions('zfs') for key in keylist: value = tunables[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_vdev(kstats_dict): """Collect information on VDEV caches""" # Currently [Nov 2017] the VDEV cache is disabled, because it is actually # harmful. When this is the case, we just skip the whole entry. See # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c # for details tunables = get_vdev_params() if tunables[VDEV_CACHE_SIZE] == '0': print('VDEV cache disabled, skipping section\n') return vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) vdev_cache_total = int(vdev_stats['hits']) +\ int(vdev_stats['misses']) +\ int(vdev_stats['delegations']) prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), f_hits(vdev_stats['hits'])) prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), f_hits(vdev_stats['misses'])) prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), f_hits(vdev_stats['delegations'])) print() def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h """ zil_stats = isolate_section('zil', kstats_dict) prt_1('ZIL committed transactions:', f_hits(zil_stats['zil_itx_count'])) prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count'])) prt_i1('Flushes to stable storage:', f_hits(zil_stats['zil_commit_writer_count'])) prt_i2('Transactions to SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']), f_hits(zil_stats['zil_itx_metaslab_slog_count'])) prt_i2('Transactions to non-SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']), f_hits(zil_stats['zil_itx_metaslab_normal_count'])) print() section_calls = {'arc': section_arc, 'archits': section_archits, 'dmu': section_dmu, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, 'vdev': section_vdev, 'zil': section_zil} def main(): """Run program. The options to draw a graph and to print all data raw are treated separately because they come with their own call. """ kstats = get_kstats() if ARGS.graph: draw_graph(kstats) sys.exit(0) print_header() if ARGS.raw: print_raw(kstats) elif ARGS.section: try: section_calls[ARGS.section](kstats) except KeyError: print('Error: Section "{0}" unknown'.format(ARGS.section)) sys.exit(1) elif ARGS.page: print('WARNING: Pages are deprecated, please use "--section"\n') pages_to_calls = {1: 'arc', 2: 'archits', 3: 'l2arc', 4: 'dmu', 5: 'vdev', 6: 'tunables'} try: call = pages_to_calls[ARGS.page] except KeyError: print('Error: Page "{0}" not supported'.format(ARGS.page)) sys.exit(1) else: section_calls[call](kstats) else: # If no parameters were given, we print all sections. We might want to # change the sequence by hand calls = sorted(section_calls.keys()) for section in calls: section_calls[section](kstats) sys.exit(0) if __name__ == '__main__': main() diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index d9dab652c3dd..81c7ab64ca2b 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -1,115 +1,94 @@ /* * Copyright (c) 2010 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_UIO_H_ #define _OPENSOLARIS_SYS_UIO_H_ #ifndef _STANDALONE #include_next #include #include #define uio_loffset uio_offset typedef struct uio uio_t; typedef struct iovec iovec_t; typedef enum uio_seg uio_seg_t; -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY -} xuio_type_t; - -typedef struct xuio { - uio_t xu_uio; - - /* Extended uio fields */ - enum xuio_type xu_type; /* What kind of uio structure? */ - union { - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - static __inline int zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) { ASSERT(uio->uio_rw == dir); return (uiomove(cp, (int)n, uio)); } #define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes); void uioskip(uio_t *uiop, size_t n); #define uio_segflg(uio) (uio)->uio_segflg #define uio_offset(uio) (uio)->uio_loffset #define uio_resid(uio) (uio)->uio_resid #define uio_iovcnt(uio) (uio)->uio_iovcnt #define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base #define uio_fault_disable(uio, set) static inline void uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) { *base = uio_iovbase(uio, idx); *len = uio_iovlen(uio, idx); } static inline void uio_advance(uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } static inline offset_t uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) { *vec_idx = 0; while (*vec_idx < uio_iovcnt(uio) && off >= uio_iovlen(uio, *vec_idx)) { off -= uio_iovlen(uio, *vec_idx); (*vec_idx)++; } return (off); } #endif /* !_STANDALONE */ #endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index ce3b38fcde2c..9bd358dc3d83 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -1,143 +1,100 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_UIO_H #define _SPL_UIO_H #include #include #include #include #include #include #include #include typedef struct iovec iovec_t; typedef enum uio_rw { UIO_READ = 0, UIO_WRITE = 1, } uio_rw_t; typedef enum uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, UIO_USERISPACE = 2, UIO_BVEC = 3, } uio_seg_t; typedef struct uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; }; int uio_iovcnt; offset_t uio_loffset; uio_seg_t uio_segflg; boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; ssize_t uio_resid; size_t uio_skip; } uio_t; -typedef struct aio_req { - uio_t *aio_uio; - void *aio_private; -} aio_req_t; - -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; - - -#define UIOA_IOV_MAX 16 - -typedef struct uioa_page_s { - int uioa_pfncnt; - void **uioa_ppp; - caddr_t uioa_base; - size_t uioa_len; -} uioa_page_t; - -typedef struct xuio { - uio_t xu_uio; - enum xuio_type xu_type; - union { - struct { - uint32_t xu_a_state; - ssize_t xu_a_mbytes; - uioa_page_t *xu_a_lcur; - void **xu_a_lppp; - void *xu_a_hwst[4]; - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - #define uio_segflg(uio) (uio)->uio_segflg #define uio_offset(uio) (uio)->uio_loffset #define uio_resid(uio) (uio)->uio_resid #define uio_iovcnt(uio) (uio)->uio_iovcnt #define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base #define uio_fault_disable(uio, set) (uio)->uio_fault_disable = set static inline void uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) { *base = uio_iovbase(uio, idx); *len = uio_iovlen(uio, idx); } static inline void uio_advance(uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } static inline offset_t uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) { *vec_idx = 0; while (*vec_idx < uio_iovcnt(uio) && off >= uio_iovlen(uio, *vec_idx)) { off -= uio_iovlen(uio, *vec_idx); (*vec_idx)++; } return (off); } #endif /* SPL_UIO_H */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 54fdbc9ad227..0c50d0409b2b 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1,1089 +1,1077 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct dsl_crypto_params; struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_ENCRYPTED 0x20 #define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * MDB doesn't have dmu_ot; it defines these macros itself. */ #ifndef ZFS_MDB #define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) #define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) #define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) #define DMU_OT_IS_ZIL(ot) \ ((ot) == DMU_OT_INTENT_LOG) /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_ENCRYPTED) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_PROJECTUSED_OBJECT (-3ULL) /* * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. */ #define DMU_OBJACCT_PREFIX "obj-" #define DMU_OBJACCT_PREFIX_LEN 4 /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t key_required, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, boolean_t key_required, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx); int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the number of levels on a dnode. nlevels must be greater than the * current number of levels or an EINVAL will be returned. */ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly * to accommodate the change. When calling this function, the caller must * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ /*ARGSUSED*/ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. * * When multiple callbacks are registered to the transaction, the callbacks * will be called in reverse order to let Lustre, the only user of commit * callback currently, take the fast path of its commit callback handling. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf -#ifdef HAVE_UIO_ZEROCOPY -int dmu_xuio_init(struct xuio *uio, int niov); -void dmu_xuio_fini(struct xuio *uio); -int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, - size_t n); -int dmu_xuio_cnt(struct xuio *uio); -struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); -void dmu_xuio_clear(struct xuio *uio, int i); -#endif /* HAVE_UIO_ZEROCOPY */ -void xuio_stat_wbuf_copied(void); -void xuio_stat_wbuf_nocopy(void); - extern int zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; boolean_t ot_encrypt; char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef struct zfs_file_info { uint64_t zfi_user; uint64_t zfi_group; uint64_t zfi_project; uint64_t zfi_generation; } zfs_file_info_t; typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 0c6273a3a727..def4aadba1d0 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -1,264 +1,257 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H #define _SYS_DMU_IMPL_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * This is the locking strategy for the DMU. Numbers in parenthesis are * cases that use that lock order, referenced below: * * ARC is self-contained * bplist is self-contained * refcount is self-contained * txg is self-contained (hopefully!) * zst_lock * zf_rwlock * * XXX try to improve evicting path? * * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs * * dp_config_rwlock * must be held before: everything * protects dd namespace changes * protects property changes globally * held from: * dsl_dir_open/r: * dsl_dir_create_sync/w: * dsl_dir_sync_destroy/w: * dsl_dir_rename_sync/w: * dsl_prop_changed_notify/r: * * os_obj_lock * must be held before: * everything except dp_config_rwlock * protects os_obj_next * held from: * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock * * dn_struct_rwlock * must be held before: * everything except dp_config_rwlock and os_obj_lock * protects structure of dnode (eg. nlevels) * db_blkptr can change when syncing out change to nlevels * dn_maxblkid * dn_nlevels * dn_*blksz* * phys nlevels, maxblkid, physical blkptr_t's (?) * held from: * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) * dbuf_read_impl: db_mtx, dmu_zfetch() * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() * dbuf_new_size: db_mtx * dbuf_dirty: db_mtx * dbuf_findbp: (callers, phys? - the real need) * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() * dnode_sync/w (increase_indirection): db_mtx (phys) * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) * dnode_new_blkid/w: (dn_maxblkid) * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) * dnode_next_offset: (phys) * * dn_dbufs_mtx * must be held before: * db_mtx, hash_mutexes * protects: * dn_dbufs * dn_evicted * held from: * dmu_evict_user: db_mtx (dn_dbufs) * dbuf_free_range: db_mtx (dn_dbufs) * dbuf_remove_ref: db_mtx, callees: * dbuf_hash_remove: hash_mutexes, db_mtx * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) * dnode_set_blksz: (dn_dbufs) * * hash_mutexes (global) * must be held before: * db_mtx * protects dbuf_hash_table (global) and db_hash_next * held from: * dbuf_find: db_mtx * dbuf_hash_insert: db_mtx * dbuf_hash_remove: db_mtx * * db_mtx (meta-leaf) * must be held before: * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) * protects: * db_state * db_holds * db_buf * db_changed * db_data_pending * db_dirtied * db_link * db_dirty_node (??) * db_dirtycnt * db_d.* * db.* * held from: * dbuf_dirty: dn_mtx, dn_dirty_mtx * dbuf_dirty->dsl_dir_willuse_space: dd_lock * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock * dbuf_undirty: dn_dirty_mtx (db_d) * dbuf_write_done: dn_dirty_mtx (db_state) * dbuf_* * dmu_buf_update_user: none (db_d) * dmu_evict_user: none (db_d) (maybe can eliminate) * dbuf_find: none (db_holds) * dbuf_hash_insert: none (db_holds) * dmu_buf_read_array_impl: none (db_state, db_changed) * dmu_sync: none (db_dirty_node, db_d) * dnode_reallocate: none (db) * * dn_mtx (leaf) * protects: * dn_dirty_dbufs * dn_ranges * phys accounting * dn_allocated_txg * dn_free_txg * dn_assigned_txg * dn_dirty_txg * dd_assigned_tx * dn_notxholds * dn_nodnholds * dn_dirtyctx * dn_dirtyctx_firstset * (dn_phys copy fields?) * (dn_phys contents?) * held from: * dnode_* * dbuf_dirty: none * dbuf_sync: none (phys accounting) * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) * dbuf_write_done: none (phys accounting) * dmu_object_info_from_dnode: none (accounting) * dmu_tx_commit: none * dmu_tx_hold_object_impl: none * dmu_tx_try_assign: dn_notxholds(cv) * dmu_tx_unassign: none * * dd_lock * must be held before: * ds_lock * ancestors' dd_lock * protects: * dd_prop_cbs * dd_sync_* * dd_used_bytes * dd_tempreserved * dd_space_towrite * dd_myname * dd_phys accounting? * held from: * dsl_dir_* * dsl_prop_changed_notify: none (dd_prop_cbs) * dsl_prop_register: none (dd_prop_cbs) * dsl_prop_unregister: none (dd_prop_cbs) * * os_lock (leaf) * protects: * os_dirty_dnodes * os_free_dnodes * os_dnodes * os_downgraded_dbufs * dn_dirtyblksz * dn_dirty_link * held from: * dnode_create: none (os_dnodes) * dnode_destroy: none (os_dnodes) * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) * dnode_free: none (dn_dirtyblksz, os_*_dnodes) * * ds_lock * protects: * ds_objset * ds_open_refcount * ds_snapname * ds_phys accounting * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* * * dr_mtx (leaf) * protects: * dr_children * held from: * dbuf_dirty * dbuf_undirty * dbuf_sync_indirect * dnode_new_blkid */ struct objset; struct dmu_pool; -typedef struct dmu_xuio { - int next; - int cnt; - struct arc_buf **bufs; - iovec_t *iovp; -} dmu_xuio_t; - typedef struct dmu_sendstatus { list_node_t dss_link; int dss_outfd; proc_t *dss_proc; offset_t *dss_off; uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, void *, dmu_buf_t **); #ifdef __cplusplus } #endif #endif /* _SYS_DMU_IMPL_H */ diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 482ffef6f473..99a5a4d2ab7b 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -1,152 +1,113 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ #ifndef _LIBSPL_SYS_UIO_H #define _LIBSPL_SYS_UIO_H #include #include_next #ifdef __APPLE__ #include #endif #include typedef struct iovec iovec_t; #if defined(__linux__) || defined(__APPLE__) typedef enum uio_rw { UIO_READ = 0, UIO_WRITE = 1, } uio_rw_t; typedef enum uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, UIO_USERISPACE = 2, } uio_seg_t; #elif defined(__FreeBSD__) typedef enum uio_seg uio_seg_t; #endif typedef struct uio { struct iovec *uio_iov; /* pointer to array of iovecs */ int uio_iovcnt; /* number of iovecs */ offset_t uio_loffset; /* file offset */ uio_seg_t uio_segflg; /* address space (kernel or user) */ uint16_t uio_fmode; /* file mode flags */ uint16_t uio_extflg; /* extended flags */ ssize_t uio_resid; /* residual count */ } uio_t; -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; - -#define UIOA_IOV_MAX 16 - -typedef struct uioa_page_s { /* locked uio_iov state */ - int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ - void **uioa_ppp; /* page_t or pfn_t array */ - caddr_t uioa_base; /* address base */ - size_t uioa_len; /* span length */ -} uioa_page_t; - -typedef struct xuio { - uio_t xu_uio; /* embedded UIO structure */ - - /* Extended uio fields */ - enum xuio_type xu_type; /* uio type */ - union { - struct { - uint32_t xu_a_state; /* state of async i/o */ - ssize_t xu_a_mbytes; /* bytes moved */ - uioa_page_t *xu_a_lcur; /* uioa_locked[] pointer */ - void **xu_a_lppp; /* lcur->uioa_pppp[] pointer */ - void *xu_a_hwst[4]; /* opaque hardware state */ - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; /* read or write buffer */ - void *xu_zc_priv; /* fs specific */ - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - #define uio_segflg(uio) (uio)->uio_segflg #define uio_offset(uio) (uio)->uio_loffset #define uio_resid(uio) (uio)->uio_resid #define uio_iovcnt(uio) (uio)->uio_iovcnt #define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base static inline void uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) { *base = uio_iovbase(uio, idx); *len = uio_iovlen(uio, idx); } static inline void uio_advance(uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } static inline offset_t uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) { *vec_idx = 0; while (*vec_idx < (uint_t)uio_iovcnt(uio) && off >= (offset_t)uio_iovlen(uio, *vec_idx)) { off -= uio_iovlen(uio, *vec_idx); (*vec_idx)++; } return (off); } #endif /* _SYS_UIO_H */ diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 2469769bc78a..0b72c0356917 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4402 +1,4244 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call zrele() within a tx then use zfs_zrele_async(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* * Virus scanning is unsupported. It would be possible to add a hook * here to performance the required virus scan. This could be done * entirely in the kernel or potentially as an update to invoke a * scanning utility. */ static int zfs_vscan(struct inode *ip, cred_t *cr, int async) { return (0); } /* ARGSUSED */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Virus scan eligible files on open */ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (zfs_vscan(ip, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ int zfs_close(struct inode *ip, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(zfs_vscan(ip, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(struct inode *ip, int cmd, loff_t *off) { znode_t *zp = ITOZ(ip); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(struct inode *ip, int cmd, loff_t *off) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_holey_common(ip, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os, uint64_t oid) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; int64_t off; void *pb; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { nbytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); (void) dmu_read(os, oid, start+off, nbytes, pb+off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; int64_t start, off; uint64_t bytes; int len = nbytes; int error = 0; void *pb; start = uio->uio_loffset; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { bytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { ASSERT(PageUptodate(pp)); unlock_page(pp); pb = kmap(pp); error = uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *resid) { ssize_t written; int error = 0; written = zpl_write_common(ZTOI(zp), data, len, &pos, UIO_SYSSPACE, 0, kcred); if (written < 0) { error = -written; } else if (resid == NULL) { if (written < len) error = SET_ERROR(EIO); /* short write */ } else { *resid = len - written; } return (error); } /* * Drop a reference on the passed inode asynchronously. This ensures * that the caller will never drop the last reference on an inode in * the current context. Doing so while holding open a tx could result * in a deadlock if iput_final() re-enters the filesystem code. */ void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); if (atomic_read(&ip->i_count) == 1) VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); else zrele(zp); } /* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_FALSE, cr))) { zrele(*zpp); *zpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_inode_update(*zpp); ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_inode_update(dzp); zfs_inode_update(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_inode_update(dzp); zfs_inode_update(zp); *ipp = ZTOI(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_inode_update(dzp); zfs_inode_update(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_inode_update(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_inode_update(dzp); zfs_inode_update(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_inode_update(dzp); zfs_inode_update(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: ZFS_EXIT(zfsvfs); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ int zfs_getattr_fast(struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); generic_fillattr(ip, sp); /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } ZFS_EXIT(zfsvfs); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err2 = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_inode_update(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; ZFS_VERIFY_ZP(tdzp); /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (S_ISDIR(ZTOI(szp)->i_mode)) { if (!S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } } else { if (S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } else { /* * If we had removed the existing target, subsequent * call to zfs_link_create() to add back the same entry * but, the new dnode (szp) should not fail. */ ASSERT(tzp == NULL); } } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); zfs_inode_update(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_inode_update(tdzp); zfs_inode_update(szp); zrele(szp); if (tzp) { zfs_inode_update(tzp); zrele(tzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_inode_update(dzp); zfs_inode_update(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ /* ARGSUSED */ int zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ /* ARGSUSED */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_inode_update(tdzp); zfs_inode_update(szp); ZFS_EXIT(zfsvfs); return (error); } static void zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); ZFS_EXIT(zfsvfs); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(pp)) wait_on_page_bit(pp, PG_writeback); } ZFS_EXIT(zfsvfs); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); __set_page_dirty_nobuffers(pp); ClearPageError(pp); end_page_writeback(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, zfs_putpage_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } ZFS_EXIT(zfsvfs); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; struct page *cur_pp; u_offset_t io_off, total; size_t io_len; loff_t i_size; unsigned page_idx; int err; os = zfsvfs->z_os; io_len = nr_pages << PAGE_SHIFT; i_size = i_size_read(ip); io_off = page_offset(pl[0]); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * Iterate over list of pages and read each page individually. */ page_idx = 0; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; cur_pp = pl[page_idx++]; va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); kunmap(cur_pp); if (err) { /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } } return (0); } /* * Uses zfs_fillpage to read data from the file and fill the pages. * * IN: ip - inode of file to get data from. * pl - list of pages to read * nr_pages - number of pages to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int err; if (pl == NULL) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); err = zfs_fillpage(ip, pl, nr_pages); ZFS_EXIT(zfsvfs); return (err); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ /*ARGSUSED*/ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENXIO)); } ZFS_EXIT(zfsvfs); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); ZFS_EXIT(zfsvfs); return (0); } -#ifdef HAVE_UIO_ZEROCOPY -/* - * The smallest read we may consider to loan out an arcbuf. - * This must be a power of 2. - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -/* - * If set to less than the file block size, allow loaning out of an - * arcbuf for a partial block read. This must be a power of 2. - */ -int zcr_blksz_max = (1 << 17); /* 128K */ - - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. - */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. - */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). - */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} -#endif /* HAVE_UIO_ZEROCOPY */ - #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); /* END CSTYLED */ #endif diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9b6481d22322..82bb7206c1ac 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1,4774 +1,4772 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* * Various statistics about the size of the dbuf cache. */ kstat_named_t cache_count; kstat_named_t cache_size_bytes; kstat_named_t cache_size_bytes_max; /* * Statistics regarding the bounds on the dbuf cache size. */ kstat_named_t cache_target_bytes; kstat_named_t cache_lowater_bytes; kstat_named_t cache_hiwater_bytes; /* * Total number of dbuf cache evictions that have occurred. */ kstat_named_t cache_total_evicts; /* * The distribution of dbuf levels in the dbuf cache and * the total size of all dbufs at each level. */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; /* * Statistics about the dbuf hash table. */ kstat_named_t hash_hits; kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. */ kstat_named_t hash_chains; kstat_named_t hash_chain_max; /* * Number of times a dbuf_create() discovers that a dbuf was * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; kstat_named_t metadata_cache_size_bytes; kstat_named_t metadata_cache_size_bytes_max; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { { "cache_count", KSTAT_DATA_UINT64 }, { "cache_size_bytes", KSTAT_DATA_UINT64 }, { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "cache_target_bytes", KSTAT_DATA_UINT64 }, { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; #define DBUF_STAT_INCR(stat, val) \ atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ DBUF_STAT_INCR(stat, 1); #define DBUF_STAT_BUMPDOWN(stat) \ DBUF_STAT_INCR(stat, -1); #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ continue; \ } static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t *cache; zfs_refcount_t size; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ unsigned long dbuf_cache_max_bytes = ULONG_MAX; unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; int dbuf_metadata_cache_shift = 6; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ uint_t dbuf_cache_hiwater_pct = 10; uint_t dbuf_cache_lowater_pct = 10; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DTRACE_SET_STATE(db, why) \ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ const char *, why) #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } if (i > 0) { DBUF_STAT_BUMP(hash_collisions); if (i == 1) DBUF_STAT_BUMP(hash_chains); DBUF_STAT_MAX(hash_chain_max, i); } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); return (NULL); } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_target_bytes()) { DBUF_STAT_BUMP(metadata_cache_overflow); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = zfs_refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { /* * Consider indirect blocks and spill blocks to be meta data. */ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } /* * The target size of the dbuf cache can grow with the ARC target, * unless limited by the tunable dbuf_cache_max_bytes. */ static inline unsigned long dbuf_cache_target_bytes(void) { return (MIN(dbuf_cache_max_bytes, arc_target_bytes() >> dbuf_cache_shift)); } /* * The target size of the dbuf metadata cache can grow with the ARC target, * unless limited by the tunable dbuf_metadata_cache_max_bytes. */ static inline unsigned long dbuf_metadata_cache_target_bytes(void) { return (MIN(dbuf_metadata_cache_max_bytes, arc_target_bytes() >> dbuf_metadata_cache_shift)); } static inline uint64_t dbuf_cache_hiwater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); } static inline uint64_t dbuf_cache_lowater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } static inline boolean_t dbuf_cache_above_lowater(void) { return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_lowater_bytes()); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) zfs_refcount_remove_many( &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); } } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ /* ARGSUSED */ static void dbuf_evict_thread(void *unused) { callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the callers context. */ static void dbuf_evict_notify(uint64_t size) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { if (size > dbuf_cache_hiwater_bytes()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } else { ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->cache_size_bytes.value.ui64 = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->hash_elements.value.ui64 = dbuf_hash_count; } return (0); } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); #else h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); #endif if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { dbuf_caches[dcs].cache = multilist_create(sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { for (i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = KSTAT_DATA_UINT64; snprintf(dbuf_stats.cache_levels_bytes[i].name, KSTAT_STRLEN, "cache_level_%d_bytes", i); dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } dbuf_ksp->ks_data = &dbuf_stats; dbuf_ksp->ks_update = dbuf_kstat_update; kstat_install(dbuf_ksp); } } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; dbuf_stats_destroy(); for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel */ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #else kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); txg_prev = dr->dr_txg; for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; dr = list_next(&db->db_dirty_records, dr)) { ASSERT(dr->dr_dbuf == db); ASSERT(txg_prev > dr->dr_txg); txg_prev = dr->dr_txg; } } /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb __maybe_unused = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); } } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } static arc_buf_t * dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data) { objset_t *os = db->db_objset; spa_t *spa = os->os_spa; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type; uint8_t complevel; int psize, lsize; psize = arc_buf_size(data); lsize = arc_buf_lsize(data); compress_type = arc_get_compression(data); complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; dnode_t *dn = DB_DNODE(db); arc_get_raw_params(data, &byteorder, salt, iv, mac); data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, dn->dn_type, psize, lsize, compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); data = arc_alloc_compressed_buf(spa, db, psize, lsize, compress_type, complevel); } else { data = arc_alloc_buf(spa, db, type, psize); } return (data); } static arc_buf_t * dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { spa_t *spa = db->db_objset->os_spa; return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT)))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ const unsigned exp = dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); if (exp >= 8 * sizeof (offset)) { /* This only happens on the highest indirection level */ ASSERT3U(level, ==, dn->dn_nlevels - 1); return (0); } ASSERT3U(exp, <, 8 * sizeof (offset)); return (offset >> exp); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } /* * This function is used to lock the parent of the provided dbuf. This should be * used when modifying or reading db_blkptr. */ db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { rw_enter(&db->db_parent->db_rwlock, rw); ret = DLT_PARENT; } else if (dmu_objset_ds(db->db_objset) != NULL) { rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, tag); ret = DLT_OBJSET; } /* * We only return a DLT_NONE lock when it's the top-most indirect block * of the meta-dnode of the MOS. */ return (ret); } /* * We need to pass the lock type in because it's possible that the block will * move from being the topmost indirect block in a dnode (and thus, have no * parent) to not the top-most via an indirection increase. This would cause a * panic if we didn't pass the lock type in. */ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); else if (type == DLT_OBJSET) rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "freed in flight"); } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "successful read"); } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } /* * Shortcut for performing reads on bonus dbufs. Returns * an error if we fail to verify the dnode associated with * a decrypted block. Otherwise success. */ static int dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { int bonuslen, max_bonuslen, err; err = dbuf_read_verify_dnode_crypt(db, flags); if (err) return (err); bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; int n_bps = indbs >> SPA_BLKPTRSHIFT; for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); } } /* * Handle reads on dbufs that are holes, if necessary. This function * requires that the dbuf's mutex is held. Returns success (0) if action * was taken, ENOENT if no action was taken. */ static int dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { ASSERT(MUTEX_HELD(&db->db_mtx)); int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (!is_hole && db->db_level == 0) { is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr); } if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth != 0) { dbuf_handle_indirect_hole(db, dn); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); return (0); } return (ENOENT); } /* * This function ensures that, when doing a decrypting read of a block, * we make sure we have decrypted the dnode associated with it. We must do * this so that we ensure we are fully authenticating the checksum-of-MACs * tree from the root of the objset down to this block. Indirect blocks are * always verified against their secure checksum-of-MACs assuming that the * dnode containing them is correct. Now that we are doing a decrypting read, * we can be sure that the key is loaded and verify that assumption. This is * especially important considering that we always read encrypted dnode * blocks as raw data (without verifying their MACs) to start, and * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) { int err = 0; objset_t *os = db->db_objset; arc_buf_t *dnode_abuf; dnode_t *dn; zbookmark_phys_t zb; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!os->os_encrypted || os->os_raw_receive || (flags & DB_RF_NO_DECRYPT) != 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { DB_DNODE_EXIT(db); return (0); } SET_BOOKMARK(&zb, dmu_objset_id(os), DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not * available. This is ok if we are only reading authenticated * (and therefore non-encrypted) blocks. */ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || (db->db_blkid == DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; DB_DNODE_EXIT(db); return (err); } /* * Drops db_mtx and the parent lock specified by dblt and tag before * returning. */ static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, db_lock_type_t dblt, void *tag) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; boolean_t bonus_read; err = zio_flags = 0; bonus_read = B_FALSE; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { err = dbuf_read_bonus(db, dn, flags); goto early_unlock; } err = dbuf_read_hole(db, dn, flags); if (err == 0) goto early_unlock; /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ if (BP_IS_REDACTED(db->db_blkptr)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); err = SET_ERROR(EIO); goto early_unlock; } SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { spa_log_error(db->db_objset->os_spa, &zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) goto early_unlock; DB_DNODE_EXIT(db); db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* * The zio layer will copy the provided blkptr later, but we need to * do this now so that we can release the parent's rwlock. We have to * do that now so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ blkptr_t bp = *db->db_blkptr; dmu_buf_unlock_parent(db, dblt, tag); (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); early_unlock: DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf); dr->dt.dl.dr_data = buf; bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf)); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { spa_t *spa = dn->dn_objset->os_spa; /* * Ensure that this block's dnode has been decrypted if * the caller has requested decrypted data. */ err = dbuf_read_verify_dnode_crypt(db, flags); /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it * before returning. We also call arc_untransform() on any * unauthenticated blocks, which will verify their MAC if * the key is now available. */ if (err == 0 && db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); dbuf_fix_old_data(db, spa_syncing_txg(spa)); err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } err = dbuf_read_impl(db, zio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us */ if (!err && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* * If we created a zio_root we must execute it to avoid * leaking it, even if it isn't attached to any work due * to an error in dbuf_read_impl(). */ if (need_wait) { if (err == 0) err = zio_wait(zio); else VERIFY0(zio_wait(zio)); } } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); } } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning filled buffer"); } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; dbuf_dirty_record_t *dr; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (zfs_refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *old_buf; dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ old_buf = db->db_buf; bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(old_buf, db); db->db.db_size = size; dr = list_head(&db->db_dirty_records); /* dirty record added by dmu_buf_will_dirty() */ VERIFY(dr != NULL); if (db->db_level == 0) dr->dt.dl.dr_data = buf; ASSERT3U(dr->dr_txg, ==, tx->tx_txg); ASSERT3U(dr->dr_accounted, ==, osize); dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os __maybe_unused = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); dnode_set_dirtyctx(dn, tx, db); if (tx->tx_txg > dn->dn_dirty_txg) dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ dr_head = list_head(&db->db_dirty_records); ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); return (dr_next); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_rwlock = B_TRUE; } /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ if (db->db_blkptr != NULL) { db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); ddt_prefetch(os->os_spa, db->db_blkptr); dmu_buf_unlock_parent(db, dblt, FTAG); } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } static void dbuf_undirty_bonus(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { struct dnode *dn = DB_DNODE(db); int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); arc_space_return(max_bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr != NULL) { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, flags); (void) dbuf_dirty(db, tx); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; mutex_enter(&db->db_mtx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); mutex_exit(&db->db_mtx); return (dr != NULL); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } /* * This function is effectively the same as dmu_buf_will_dirty(), but * indicates the caller expects raw encrypted data in the db, and provides * the crypt params (byteorder, salt, iv, mac) which should be stored in the * blkptr_t when this dbuf is written. This is only used for blocks of * dnodes, during raw receive. */ void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; /* * dr_has_raw_params is only processed for blocks of dnodes * (see dbuf_sync_dnode_leaf_crypt()). */ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN); bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN); bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); } static void dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) { struct dirty_leaf *dl; dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } /* ARGSUSED */ void dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); old_state = db->db_state; db->db_state = DB_CACHED; if (old_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; DTRACE_SET_STATE(db, "fill done handling freed in flight"); } else { DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dmu_object_type_t type; ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); dmu_buf_will_not_fill(dbuf, tx); blkptr_t bp = { { { {0} } } }; BP_SET_TYPE(&bp, type); BP_SET_LEVEL(&bp, 0); BP_SET_BIRTH(&bp, tx->tx_txg, 0); BP_SET_REDACTED(&bp); BPE_SET_LSIZE(&bp, dbuf->db_size); dbuf_override_impl(db, &bp, tx); } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { /* * In practice, we will never have a case where we have an * encrypted arc buffer while additional holds exist on the * dbuf. We don't handle this here so we simply assert that * fact instead. */ ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); return; } - xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(zfs_refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "buffer cleared"); } } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; DTRACE_SET_STATE(db, "buffer eviction started"); db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter_nested(&dn->dn_dbufs_mtx, NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(zfs_refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or {user|group|project}used * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. */ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dbuf_node)); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "bonus buffer created"); db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before it's added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "regular buffer created"); db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || zfs_refcount_count(&dn->dn_holds) > 0); (void) zfs_refcount_add(&dn->dn_holds, db); dprintf_dbuf(db, "db=%p\n", db); return (db); } /* * This function returns a block pointer and information about the object, * given a dnode and a block. This is a publicly accessible version of * dbuf_findbp that only returns some information, rather than the * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock * should be locked as (at least) a reader. */ int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) { dmu_buf_impl_t *dbp = NULL; blkptr_t *bp2; int err = 0; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); if (datablkszsec != NULL) *datablkszsec = dn->dn_phys->dn_datablkszsec; if (indblkshift != NULL) *indblkshift = dn->dn_phys->dn_indblkshift; } return (err); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { if (dpa->dpa_cb != NULL) dpa->dpa_cb(dpa->dpa_arg, io_done); kmem_free(dpa, sizeof (*dpa)); } static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; dbuf_prefetch_fini(dpa, B_TRUE); if (abuf != NULL) arc_buf_destroy(abuf, private); } /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && dpa->dpa_curlevel == 0) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); return (dbuf_prefetch_fini(dpa, B_TRUE)); } ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); return (dbuf_prefetch_fini(dpa, B_TRUE)); } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { dbuf_prefetch_fini(dpa, B_TRUE); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ int dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ goto no_issue; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } ASSERT(!BP_IS_REDACTED(&bp) || dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; dpa->dpa_cb = cb; dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); return (1); no_issue: if (cb != NULL) cb(arg, B_FALSE); return (0); } int dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* * Helper function for dbuf_hold_impl() to copy a buffer. Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). */ noinline static void dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *newdata, *data = dr->dt.dl.dr_data; newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data); dbuf_set_data(db, newdata); rw_enter(&db->db_rwlock, RW_WRITER); bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; dsl_pool_t *dp = spa->spa_dsl_pool; if (dp != NULL) { ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); } ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf != NULL) { arc_buf_access(db->db_buf); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) dbuf_hold_copy(dn, db); } if (multilist_link_active(&db->db_cache_link)) { ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } (void) zfs_refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); dbuf_new_size(db, blksz, tx); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) zfs_refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&found_db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. The 'evicting' * argument should be set if we are already in the dbuf-evicting code * path, in which case we don't want to recursively evict. This allows us to * avoid deeply nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; uint64_t size; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = zfs_refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else { boolean_t do_arc_evict = B_FALSE; blkptr_t bp; spa_t *spa = dmu_objset_spa(db->db_objset); if (!DBUF_IS_CACHEABLE(db) && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { do_arc_evict = B_TRUE; bp = *db->db_blkptr; } if (!DBUF_IS_CACHEABLE(db) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(dbuf_caches[dcs].cache, db); size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db->db.db_size, db); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX( metadata_cache_size_bytes_max, size); } else { DBUF_STAT_BUMP( cache_levels[db->db_level]); DBUF_STAT_BUMP(cache_count); DBUF_STAT_INCR( cache_levels_bytes[db->db_level], db->db.db_size); DBUF_STAT_MAX(cache_size_bytes_max, size); } mutex_exit(&db->db_mtx); if (dcs == DB_DBUF_CACHE && !evicting) dbuf_evict_notify(size); } if (do_arc_evict) arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (zfs_refcount_count(&db->db_holds)); } uint64_t dmu_buf_user_refcount(dmu_buf_t *db_fake) { uint64_t holds; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; mutex_exit(&db->db_mtx); return (holds); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait() { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; void *data = dr->dt.dl.dr_data; ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); dnode_t *dn = DB_DNODE(db); ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); DB_DNODE_EXIT(db); dbuf_sync_leaf_verify_bonus_dnode(dr); dbuf_undirty_bonus(dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } /* * When syncing out a blocks of dnodes, adjust the block to deal with * encryption. Normally, we make sure the block is decrypted before writing * it. If we have crypt params, then we are writing a raw (encrypted) block, * from a raw receive. In this case, set the ARC buf's crypt params so * that the BP will be filled with the correct byteorder, salt, iv, and mac. */ static void dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) { int err; dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; /* * Unfortunately, there is currently no mechanism for * syncing context to handle decryption errors. An error * here is only possible if an attacker maliciously * changed a dnode block and updated the associated * checksums going up the block tree. */ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); err = arc_untransform(db->db_buf, db->db_objset->os_spa, &zb, B_TRUE); if (err) panic("Invalid dnode block MAC"); } else if (dr->dt.dl.dr_has_raw_params) { (void) arc_release(dr->dt.dl.dr_data, db); arc_convert_to_raw(dr->dt.dl.dr_data, dmu_objset_id(db->db_objset), dr->dt.dl.dr_byteorder, DMU_OT_DNODE, dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * Verify that the size of the data in our bonus buffer does not exceed * its recorded size. * * The purpose of this verification is to catch any cases in development * where the size of a phys structure (i.e space_map_phys_t) grows and, * due to incorrect feature management, older pools expect to read more * data even though they didn't actually write it to begin with. * * For a example, this would catch an error in the feature logic where we * open an older pool and we expect to write the space map histogram of * a space map with size SPACE_MAP_SIZE_V0. */ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG dnode_t *dn = DB_DNODE(dr->dr_dbuf); /* * Encrypted bonus buffers can have data past their bonuslen. * Skip the verification of these blocks. */ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) return; uint16_t bonuslen = dn->dn_phys->dn_bonuslen; uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, maxbonuslen); arc_buf_t *datap = dr->dt.dl.dr_data; char *datap_end = ((char *)datap) + bonuslen; char *datap_max = ((char *)datap) + maxbonuslen; /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) ASSERT(*datap_end == 0); #endif } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. Otherwise, after writing out the spill * block to the new allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dr->dr_dbuf == db); dbuf_sync_bonus(dr, tx); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } /* * If this is a dnode block, ensure it is appropriately encrypted * or decrypted, depending on what we are writing to it this txg. */ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ *datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf); bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap)); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) { ASSERT0(db->db_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = db->db_blkid; } mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { dnode_phys_t *dnp = (void *)(((char *)db->db.db_data) + i); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; dmu_buf_unlock_parent(db, dblt, FTAG); } /* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to prior to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1ULL << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dbuf_write_done(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); dr = db->db_data_pending; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs __maybe_unused = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); /* * If we didn't do a physical write in this ZIO and we * still ended up here, it means that the space of the * dbuf that we just released (and undirtied) above hasn't * been marked as undirtied in the pool's accounting. * * Thus, we undirty that space in the pool's view of the * world here. For physical writes this type of update * happens in dbuf_write_physdone(). * * If we did a physical write, cleanup any rounding errors * that came up due to writing multiple copies of a block * on disk [see dbuf_write_physdone()]. */ if (zio->io_phys_children == 0) { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); } else { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted % zio->io_phys_children, zio->io_txg); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_put(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * If the blkptr being remapped is tracked by a livelist, * then we need to make sure the livelist reflects the update. * First, cancel out the old blkptr by appending a 'FREE' * entry. Next, add an 'ALLOC' to track the new version. This * way we avoid trying to free an inaccurate blkptr at delete. * Note that embedded blkptrs are not tracked in livelists. */ if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); bplist_append(&ds->ds_dir->dd_pending_allocs, &bp_copy); } } /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ if (rw != NULL) rw_enter(rw, RW_WRITER); *bp = bp_copy; if (rw != NULL) rw_exit(rw); } } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : &dn->dn_dbuf->db_rwlock); dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, tx); } } } } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *pio; /* parent I/O */ int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); pio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); pio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. */ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); EXPORT_SYMBOL(dbuf_hold_level); EXPORT_SYMBOL(dbuf_create_bonus); EXPORT_SYMBOL(dbuf_spill_set_blksz); EXPORT_SYMBOL(dbuf_rm_spill); EXPORT_SYMBOL(dbuf_add_ref); EXPORT_SYMBOL(dbuf_rele); EXPORT_SYMBOL(dbuf_rele_and_unlock); EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " "directly."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when the evict thread stops " "evicting dbufs."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, "Maximum size in bytes of the dbuf metadata cache."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW, "Set the size of the dbuf cache to a log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, "Set the size of the dbuf metadata cache to a log2 fraction of arc " "size."); /* END CSTYLED */ diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 2c96645214f8..14d864fbcda9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1,2483 +1,2312 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ unsigned long zfs_per_txg_dirty_frees_percent = 5; /* * Enable/disable forcing txg sync when dirty in dmu_offset_next. */ int zfs_dmu_offset_next_sync = 0; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; static int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn), B_TRUE); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * See comment before the definition of dmu_prefetch_max. */ len = MIN(len, dmu_prefetch_max); /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ /*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { if (err == 0) err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. */ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -#ifdef HAVE_UIO_ZEROCOPY -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = (iovec_t *)uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - -void -dmu_xuio_fini(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; - - kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. - */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = (iovec_t *)uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} -#endif /* HAVE_UIO_ZEROCOPY */ - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void -xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); -} - #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; -#endif /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); -#ifdef HAVE_UIO_ZEROCOPY - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) - uio_advance(uio, tocpy); - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else -#endif #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #else err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); #endif if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. */ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #else err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); #endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); bzero(zp->zp_iv, ZIO_DATA_IV_LEN); bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * This function is only called from zfs_holey_common() for zpl_llseek() * in order to determine the location of holes. In order to accurately * report holes all dirty data must be synced to disk. This causes extremely * poor performance when seeking for holes in a dirty file. As a compromise, * only provide hole data when the dnode is clean. When a dnode is dirty * report the dnode as having no holes which is always a safe thing to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; boolean_t clean = B_TRUE; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Check if dnode is dirty */ for (i = 0; i < TXG_SIZE; i++) { if (multilist_link_active(&dn->dn_dirty_link[i])) { clean = B_FALSE; break; } } /* * If compatibility option is on, sync any current changes before * we go trundling through the block pointers. */ if (!clean && zfs_dmu_offset_next_sync) { clean = B_TRUE; dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } if (clean) err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); else err = SET_ERROR(EBUSY); dnode_rele(dn, FTAG); return (err); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); - xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); - xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, "Limit one prefetch call to this size"); /* END CSTYLED */