diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index cd2496bf7e95..1676020d04d3 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,1961 +1,1963 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include <sys/time.h> #include <sys/zio_priority.h> #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_INVALID = 0, ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4), ZFS_TYPE_VDEV = (1 << 5), } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXNAMELEN_NEW 1024 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. 
*/ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZPROP_USERPROP = ZPROP_INVAL, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, ZFS_PROP_DIRECT, ZFS_PROP_LONGNAME, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; _SYS_FS_ZFS_H const char *const zfs_userquota_prop_prefixes[ ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. Properties must be registered in zfs_prop_init(). 
*/ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, ZPOOL_PROP_DEDUPCACHED, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_BOOLEAN_NA 2 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * VDEV properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. 
*/ typedef enum { VDEV_PROP_INVAL = -1, VDEV_PROP_USERPROP = VDEV_PROP_INVAL, VDEV_PROP_NAME, VDEV_PROP_CAPACITY, VDEV_PROP_STATE, VDEV_PROP_GUID, VDEV_PROP_ASIZE, VDEV_PROP_PSIZE, VDEV_PROP_ASHIFT, VDEV_PROP_SIZE, VDEV_PROP_FREE, VDEV_PROP_ALLOCATED, VDEV_PROP_COMMENT, VDEV_PROP_EXPANDSZ, VDEV_PROP_FRAGMENTATION, VDEV_PROP_BOOTSIZE, VDEV_PROP_PARITY, VDEV_PROP_PATH, VDEV_PROP_DEVID, VDEV_PROP_PHYS_PATH, VDEV_PROP_ENC_PATH, VDEV_PROP_FRU, VDEV_PROP_PARENT, VDEV_PROP_CHILDREN, VDEV_PROP_NUMCHILDREN, VDEV_PROP_READ_ERRORS, VDEV_PROP_WRITE_ERRORS, VDEV_PROP_CHECKSUM_ERRORS, VDEV_PROP_INITIALIZE_ERRORS, VDEV_PROP_OPS_NULL, VDEV_PROP_OPS_READ, VDEV_PROP_OPS_WRITE, VDEV_PROP_OPS_FREE, VDEV_PROP_OPS_CLAIM, VDEV_PROP_OPS_TRIM, VDEV_PROP_BYTES_NULL, VDEV_PROP_BYTES_READ, VDEV_PROP_BYTES_WRITE, VDEV_PROP_BYTES_FREE, VDEV_PROP_BYTES_CLAIM, VDEV_PROP_BYTES_TRIM, VDEV_PROP_REMOVING, VDEV_PROP_ALLOCATING, VDEV_PROP_FAILFAST, VDEV_PROP_CHECKSUM_N, VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, VDEV_PROP_SLOW_IO_N, VDEV_PROP_SLOW_IO_T, VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, VDEV_NUM_PROPS } vdev_prop_t; /* * Dataset property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t); _SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop); _SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); _SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t); _SYS_FS_ZFS_H zfs_prop_t zfs_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *); _SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *); _SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t); _SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t); _SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *); _SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *); _SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * VDEV property functions shared between libzfs and kernel. 
*/ _SYS_FS_ZFS_H vdev_prop_t vdev_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t vdev_prop_user(const char *name); _SYS_FS_ZFS_H const char *vdev_prop_to_name(vdev_prop_t); _SYS_FS_ZFS_H const char *vdev_prop_default_string(vdev_prop_t); _SYS_FS_ZFS_H uint64_t vdev_prop_default_numeric(vdev_prop_t); _SYS_FS_ZFS_H boolean_t vdev_prop_readonly(vdev_prop_t prop); _SYS_FS_ZFS_H int vdev_prop_index_to_string(vdev_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int vdev_prop_string_to_index(vdev_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H boolean_t zpool_prop_vdev(const char *name); _SYS_FS_ZFS_H uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST, ZFS_REDUNDANT_METADATA_SOME, ZFS_REDUNDANT_METADATA_NONE } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum { ZFS_DIRECT_DISABLED = 0, ZFS_DIRECT_STANDARD, ZFS_DIRECT_ALWAYS } zfs_direct_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; typedef enum { ZFS_PREFETCH_NONE = 0, ZFS_PREFETCH_METADATA = 1, ZFS_PREFETCH_ALL = 2 } zfs_prefetch_type_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. 
*/ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * The incrementing pool version number has been replaced by pool feature * flags. For more details, see zfeature.c. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk 
format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Persistent L2ARC version */ #define L2ARC_PERSISTENT_VERSION_1 1ULL #define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 #define L2ARC_PERSISTENT_VERSION_STRING "1" /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with "<reversed-DNS>:" * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently. 
*/ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" #define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define 
ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" #define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" #define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" /* Number of Direct I/O write verify errors */ #define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors" /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" #define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_NONALLOCATING "non_allocating" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS 
"com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_META_ERRORS "verify_meta_errors" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" /* dRAID configuration */ #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_DRAID_MAXPARITY 3 #define VDEV_DRAID_MIN_CHILDREN 2 #define VDEV_DRAID_MAX_CHILDREN UINT8_MAX /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ "org.openzfs:raidz_expand_state" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ "org.openzfs:raidz_expand_start_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ "org.openzfs:raidz_expand_end_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ "org.openzfs:raidz_expand_bytes_copied" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* vdev initialize state */ #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* vdev TRIM state */ #define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ "org.zfsonlinux:next_offset_to_trim" #define VDEV_LEAF_ZAP_TRIM_STATE \ "org.zfsonlinux:vdev_trim_state" #define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ "org.zfsonlinux:vdev_trim_action_time" #define VDEV_LEAF_ZAP_TRIM_RATE \ "org.zfsonlinux:vdev_trim_rate" #define VDEV_LEAF_ZAP_TRIM_PARTIAL \ "org.zfsonlinux:vdev_trim_partial" #define VDEV_LEAF_ZAP_TRIM_SECURE \ "org.zfsonlinux:vdev_trim_secure" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * Settings for zpool compatibility features files */ #define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d" #define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d" #define ZPOOL_COMPAT_MAXSIZE 16384 /* * Hard-wired compatibility settings */ #define ZPOOL_COMPAT_LEGACY "legacy" #define ZPOOL_COMPAT_OFF "off" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
*/ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to * user programs. */ #define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
*/ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ /* error scrub values stored on disk */ uint64_t pss_error_scrub_func; /* pool_scan_func_t */ uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ uint64_t pss_error_scrub_start; /* error scrub start time */ uint64_t pss_error_scrub_end; /* error scrub end time */ uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ /* error blocks to be issued I/O */ uint64_t pss_error_scrub_to_be_examined; /* error scrub values not stored on disk */ /* error scrub pause time in milliseconds */ uint64_t pss_pass_error_scrub_pause; } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef struct pool_raidz_expand_stat { uint64_t pres_state; /* dsl_scan_state_t */ uint64_t pres_expanding_vdev; uint64_t pres_start_time; uint64_t pres_end_time; uint64_t pres_to_reflow; /* bytes that need to be moved */ uint64_t pres_reflowed; /* bytes moved so far */ uint64_t pres_waiting_for_resilver; } pool_raidz_expand_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ uint64_t vrs_end_time; /* time_t */ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ uint64_t vrs_bytes_issued; /* read bytes issued */ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ uint64_t vrs_bytes_est; /* total bytes to scan */ uint64_t vrs_errors; /* scanning errors */ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. * The ordering of this enum must be maintained to ensure the errata identifiers * map to the correct documentation. New errata may only be appended to the * list and must contain corresponding documentation at the above link. 
*/ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and user land as an nvlist uint64 array. * * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in * order to keep subsequent members at their known fixed offsets. When * adding a new field it must be added to the end the structure. */ #define VS_ZIO_TYPES 6 typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* supported by device */ uint64_t vs_trim_bytes_done; /* bytes trimmed */ uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ uint64_t vs_dio_verify_errors; /* DIO write verify errors */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof (uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist. */ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in * before this. 
*/ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * TRIM functions. */ typedef enum pool_trim_func { POOL_TRIM_START, POOL_TRIM_CANCEL, POOL_TRIM_SUSPEND, POOL_TRIM_FUNCS } pool_trim_func_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_DEVDIR "/dev" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 16384 typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; typedef enum { VDEV_TRIM_NONE, VDEV_TRIM_ACTIVE, VDEV_TRIM_CANCELED, VDEV_TRIM_SUSPENDED, VDEV_TRIM_COMPLETE, } vdev_trim_state_t; typedef enum { VDEV_REBUILD_NONE, VDEV_REBUILD_ACTIVE, VDEV_REBUILD_CANCELED, VDEV_REBUILD_COMPLETE, } vdev_rebuild_state_t; /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* * Core features - 89/128 numbers reserved. 
*/ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), #endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_RECV_NEW, /* 0x5a46 */ ZFS_IOC_POOL_SYNC, /* 0x5a47 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
*/ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL, /* 0x85 (Linux) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) #ifdef __linux__ /* * IOCTLs to update and retrieve additional file level attributes on * Linux. */ #define ZFS_IOC_GETDOSFLAGS _IOR(0x83, 1, uint64_t) #define ZFS_IOC_SETDOSFLAGS _IOW(0x83, 2, uint64_t) /* * Additional file level attributes, that are stored * in the upper half of z_pflags */ #define ZFS_READONLY 0x0000000100000000ull #define ZFS_HIDDEN 0x0000000200000000ull #define ZFS_SYSTEM 0x0000000400000000ull #define ZFS_ARCHIVE 0x0000000800000000ull #define ZFS_IMMUTABLE 0x0000001000000000ull #define ZFS_NOUNLINK 0x0000002000000000ull #define ZFS_APPENDONLY 0x0000004000000000ull #define ZFS_NODUMP 0x0000008000000000ull #define ZFS_OPAQUE 0x0000010000000000ull #define ZFS_AV_QUARANTINED 0x0000020000000000ull #define ZFS_AV_MODIFIED 0x0000040000000000ull #define ZFS_REPARSE 0x0000080000000000ull #define ZFS_OFFLINE 0x0000100000000000ull #define ZFS_SPARSE 0x0000200000000000ull #define ZFS_DOS_FL_USER_VISIBLE (ZFS_IMMUTABLE | ZFS_APPENDONLY | \ ZFS_NOUNLINK | ZFS_ARCHIVE | ZFS_NODUMP | ZFS_SYSTEM | \ ZFS_HIDDEN | ZFS_READONLY | ZFS_REPARSE | ZFS_OFFLINE | \ ZFS_SPARSE) #endif /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. * * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, ZFS_ERR_WRONG_PARENT, ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, ZFS_ERR_VDEV_NOTSUP, ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, + ZFS_ERR_STREAM_LARGE_MICROZAP, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. 
*/ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; typedef enum { ZPOOL_WAIT_CKPT_DISCARD, ZPOOL_WAIT_FREE, ZPOOL_WAIT_INITIALIZE, ZPOOL_WAIT_REPLACE, ZPOOL_WAIT_REMOVE, ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; typedef enum { ZFS_WAIT_DELETEQ, ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; typedef enum { ZPOOL_PREFETCH_NONE = 0, ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; typedef enum { ZPOOL_DDT_PRUNE_NONE, ZPOOL_DDT_PRUNE_AGE, /* in seconds */ ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ } zpool_ddt_prune_unit_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" #define ZPOOL_HIST_ELAPSED_NS "elapsed_ns" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * The following is used when invoking ZFS_IOC_POOL_GET_PROPS. */ #define ZPOOL_GET_PROPS_NAMES "get_props_names" /* * Opt-in property names used with ZPOOL_GET_PROPS_NAMES. * For example, properties that are hidden or expensive to compute. */ #define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * The following are names used when invoking ZFS_IOC_POOL_REGUID. */ #define ZPOOL_REGUID_GUID "guid" /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ #define ZPOOL_TRIM_COMMAND "trim_command" #define ZPOOL_TRIM_VDEVS "trim_vdevs" #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ #define ZPOOL_WAIT_ACTIVITY "wait_activity" #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_VDEV_GET_PROP. */ #define ZPOOL_VDEV_PROPS_GET_VDEV "vdevprops_get_vdev" #define ZPOOL_VDEV_PROPS_GET_PROPS "vdevprops_get_props" /* * The following are names used when invoking ZFS_IOC_VDEV_SET_PROP. */ #define ZPOOL_VDEV_PROPS_SET_VDEV "vdevprops_set_vdev" #define ZPOOL_VDEV_PROPS_SET_PROPS "vdevprops_set_props" /* * The following are names used when invoking ZFS_IOC_WAIT_FS. 
*/ #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_POOL_PREFETCH. */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" /* * The following are names used when invoking ZFS_IOC_DDT_PRUNE. */ #define DDT_PRUNE_UNIT "ddt_prune_unit" #define DDT_PRUNE_AMOUNT "ddt_prune_amount" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_ONLINE_SPARE 0x10 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_FINISH * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. 
*/ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * The recordsize property can not be set larger than zfs_max_recordsize * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* supported encryption algorithms */ enum zio_encrypt { ZIO_CRYPT_INHERIT = 0, ZIO_CRYPT_ON, ZIO_CRYPT_OFF, ZIO_CRYPT_AES_128_CCM, ZIO_CRYPT_AES_192_CCM, ZIO_CRYPT_AES_256_CCM, ZIO_CRYPT_AES_128_GCM, ZIO_CRYPT_AES_192_GCM, ZIO_CRYPT_AES_256_GCM, ZIO_CRYPT_FUNCTIONS }; #define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* * xattr namespace prefixes. These are forbidden in xattr names. * * For cross-platform compatibility, xattrs in the user namespace should not be * prefixed with the namespace name, but for backwards compatibility with older * ZFS on Linux versions we do prefix the namespace. */ #define ZFS_XA_NS_FREEBSD_PREFIX "freebsd:" #define ZFS_XA_NS_FREEBSD_PREFIX_LEN strlen("freebsd:") #define ZFS_XA_NS_LINUX_SECURITY_PREFIX "security." #define ZFS_XA_NS_LINUX_SECURITY_PREFIX_LEN strlen("security.") #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX "system." #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX_LEN strlen("system.") #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX "trusted." #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX_LEN strlen("trusted.") #define ZFS_XA_NS_LINUX_USER_PREFIX "user." 
#define ZFS_XA_NS_LINUX_USER_PREFIX_LEN strlen("user.") #define ZFS_XA_NS_PREFIX_MATCH(ns, name) \ (strncmp(name, ZFS_XA_NS_##ns##_PREFIX, \ ZFS_XA_NS_##ns##_PREFIX_LEN) == 0) #define ZFS_XA_NS_PREFIX_FORBIDDEN(name) \ (ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SECURITY, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SYSTEM, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 0c72c6881f0f..fad2c8bfa695 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -1,242 +1,244 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H #include #include #include #ifdef __cplusplus extern "C" { #endif extern int fzap_default_block_shift; #define ZAP_MAGIC 0x2F52AB2ABULL #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) -#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE #define ZAP_NEED_CD (-1U) typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; uint16_t mze_pad; /* in case we want to chain them someday */ char mze_name[MZAP_NAME_LEN]; } mzap_ent_phys_t; typedef struct mzap_phys { uint64_t mz_block_type; /* ZBT_MICRO */ uint64_t mz_salt; uint64_t mz_normflags; uint64_t mz_pad[5]; mzap_ent_phys_t mz_chunk[1]; /* actually variable size depending on block size */ } mzap_phys_t; typedef struct mzap_ent { uint32_t mze_hash; uint16_t mze_cd; /* copy from mze_phys->mze_cd */ uint16_t mze_chunkid; } mzap_ent_t; #define MZE_PHYS(zap, mze) \ (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of * 1<= 6] [zap_leaf_t] [ptrtbl] ... * */ struct dmu_buf; struct zap_leaf; #define ZBT_LEAF ((1ULL << 63) + 0) #define ZBT_HEADER ((1ULL << 63) + 1) #define ZBT_MICRO ((1ULL << 63) + 3) /* any other values are ptrtbl blocks */ /* * the embedded pointer table takes up half a block: * block size / entry size (2^3) / 2 */ #define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) /* * The embedded pointer table starts half-way through the block. 
Since * the pointer table itself is half the block, it starts at (64-bit) * word number (1<zap_dbuf->db_data); } static inline mzap_phys_t * zap_m_phys(zap_t *zap) { return (zap->zap_dbuf->db_data); } typedef struct zap_name { zap_t *zn_zap; int zn_key_intlen; const void *zn_key_orig; int zn_key_orig_numints; const void *zn_key_norm; int zn_key_norm_numints; uint64_t zn_hash; matchtype_t zn_matchtype; int zn_normflags; int zn_normbuf_len; char zn_normbuf[]; } zap_name_t; #define zap_f zap_u.zap_fat #define zap_m zap_u.zap_micro boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp); void zap_unlockdir(zap_t *zap, const void *tag); void zap_evict_sync(void *dbu); zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); +uint64_t zap_get_micro_max_size(spa_t *spa); + #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); void fzap_prefetch(zap_name_t *zn); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, const void *tag, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, const void *tag, dmu_tx_t *tx); int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_IMPL_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 470b2ed5f7cb..aa20e52a7634 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -1,585 +1,588 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. 
*/ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 #define ZFS_SNAPDIR_DISABLED 2 /* * Property values for snapdev */ #define ZFS_SNAPDEV_HIDDEN 0 #define ZFS_SNAPDEV_VISIBLE 1 /* * Property values for acltype */ #define ZFS_ACLTYPE_OFF 0 #define ZFS_ACLTYPE_POSIX 1 #define ZFS_ACLTYPE_NFSV4 2 /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |reserve| feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. * * The top 8 bits are reserved for future expansion. At time of writing there * are no plans for these. If you want to use them, please reach out to the * OpenZFS community, e.g., on GitHub or Slack. */ /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 56) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 56, x) /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) #define DMU_BACKUP_FEATURE_REDACTED (1 << 21) #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) #define DMU_BACKUP_FEATURE_RAW (1 << 24) #define DMU_BACKUP_FEATURE_ZSTD (1 << 25) #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) /* * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even * if the previous send did not use LARGE_BLOCKS, and thus its large blocks * were split into multiple 128KB WRITE records. (See * flush_write_batch_impl() and receive_object()). Older software that does * not support this flag may encounter a bug when switching to large blocks, * which causes files to incorrectly be zeroed. 
* * This flag is currently not set on any send streams. In the future, we * intend for incremental send streams of snapshots that have large blocks to * use LARGE_BLOCKS by default, and these streams will also have the * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) #define DMU_BACKUP_FEATURE_LONGNAME (1 << 28) +#define DMU_BACKUP_FEATURE_LARGE_MICROZAP (1 << 29) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME) + DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_LONGNAME | \ + DMU_BACKUP_FEATURE_LARGE_MICROZAP) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; #define DMU_BACKUP_MAGIC 0x2F5bacbacULL /* * Send stream flags. Bits 24-31 are reserved for vendor-specific * implementations and should not be used. */ #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream. * * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have * DRR_OBJECT_SPILL set if and only if they should have a spill block * (either an existing one, or a new one in the send stream). When clear * the object does not have a spill block and any existing spill block * should be freed. * * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will * have DRR_SPILL_UNMODIFIED set if and only if they were included for * backward compatibility purposes, and can be safely ignored by new versions * of zfs receive. Previous versions of ZFS which do not understand the * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing * spill blocks. 
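 *
 * Hedged sketch of the intended receiver behaviour (illustrative logic
 * only; handle_spill() and free_spill() are hypothetical helpers named
 * just for this sketch, not functions in this change):
 *
 *	if (drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) {
 *		if (DRR_OBJECT_HAS_SPILL(drro->drr_flags))
 *			handle_spill(drro);
 *		else
 *			free_spill(drro);
 *	}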
*/ #define DRR_FLAG_SPILL_BLOCK (1<<3) /* * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT, * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) /* not used for SPILL records */ #define DRR_RAW_BYTESWAP (1<<1) #define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */ #define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */ #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) #define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP) #define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL) #define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED) /* deal with compressed drr_write replay records */ #define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) #define DRR_WRITE_PAYLOAD_SIZE(drrw) \ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ (drrw)->drr_logical_size) #define DRR_SPILL_PAYLOAD_SIZE(drrs) \ ((drrs)->drr_compressed_size ? \ (drrs)->drr_compressed_size : (drrs)->drr_length) #define DRR_OBJECT_PAYLOAD_SIZE(drro) \ ((drro)->drr_raw_bonuslen != 0 ? \ (drro)->drr_raw_bonuslen : P2ROUNDUP((drro)->drr_bonuslen, 8)) /* Header is used in C++ so can't forward declare untagged struct */ struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; }; typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin drr_begin; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_dn_slots; uint8_t drr_flags; uint32_t drr_raw_bonuslen; uint64_t drr_toguid; /* only (possibly) nonzero for raw streams */ uint8_t drr_indblkshift; uint8_t drr_nlevels; uint8_t drr_nblkptr; uint8_t drr_pad[5]; uint64_t drr_maxblkid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad2[5]; /* deduplication key */ ddt_key_t drr_key; /* only nonzero if drr_compressiontype is not 0 */ uint64_t drr_compressed_size; /* only nonzero for raw streams */ uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_free; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_flags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; struct drr_spill { uint64_t drr_object; 
uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad[6]; /* only nonzero for raw streams */ uint64_t drr_compressed_size; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; dmu_object_type_t drr_type; /* spill data follows */ } drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; struct drr_object_range { uint64_t drr_firstobj; uint64_t drr_numslots; uint64_t drr_toguid; uint8_t drr_salt[ZIO_DATA_SALT_LEN]; uint8_t drr_iv[ZIO_DATA_IV_LEN]; uint8_t drr_mac[ZIO_DATA_MAC_LEN]; uint8_t drr_flags; uint8_t drr_pad[3]; } drr_object_range; struct drr_redact { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; } drr_redact; /* * Note: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. */ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_dvas; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 #define ZINJECT_CALC_RANGE 0x8 #define ZEVENT_NONE 0x0 #define ZEVENT_NONBLOCK 0x1 #define ZEVENT_SIZE 1024 #define ZEVENT_SEEK_START 0 #define ZEVENT_SEEK_END UINT64_MAX /* scaled frequency ranges */ #define ZI_PERCENTAGE_MIN 4294UL #define ZI_PERCENTAGE_MAX UINT32_MAX #define ZI_NO_DVA (-1) typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, ZINJECT_DELAY_IMPORT, ZINJECT_DELAY_EXPORT, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. 
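 *
 * Illustration only (not part of this change): the mode is surfaced to
 * userland as the "casesensitivity" property, so a libzfs consumer could
 * check it with something like
 *
 *	uint64_t mode = zfs_prop_get_int(zhp, ZFS_PROP_CASE);
 *	if (mode == ZFS_CASE_MIXED)
 *		(void) printf("FIGNORECASE lookups are case-insensitive\n");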
*/ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; /* * zfs ioctl command structure */ /* * Note: this struct must have the same layout in 32-bit and 64-bit, so * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit * kernel. Therefore, we add padding to it so that no "hidden" padding * is automatically added on 64-bit (but not on 32-bit). */ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; uint64_t zc_zoneid; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL struct objset; struct zfsvfs; typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); extern void zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); extern int getzfsvfs(const char *, struct zfsvfs **); enum zfsdev_state_type { ZST_ONEXIT, ZST_ZEVENT, ZST_ALL, }; /* * The zfsdev_state_t structure is managed as a singly-linked list * from which items are never deleted. This allows for lock-free * reading of the list so long as assignments to the zs_next and * reads from zs_minor are performed atomically. Empty items are * indicated by storing -1 into zs_minor. */ typedef struct zfsdev_state { struct zfsdev_state *zs_next; /* next zfsdev_state_t link */ minor_t zs_minor; /* made up minor number */ void *zs_onexit; /* onexit data */ void *zs_zevent; /* zevent data */ } zfsdev_state_t; extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); extern int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp); extern uint_t zfs_allow_log_key; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 1ca122d30ef5..ac42b5c0cd6b 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -1,144 +1,146 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _ZFEATURE_COMMON_H #define _ZFEATURE_COMMON_H extern __attribute__((visibility("default"))) #include #include #include #ifdef __cplusplus extern "C" { #endif struct zfeature_info; typedef enum spa_feature { SPA_FEATURE_NONE = -1, SPA_FEATURE_ASYNC_DESTROY, SPA_FEATURE_EMPTY_BPOBJ, SPA_FEATURE_LZ4_COMPRESS, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, SPA_FEATURE_SPACEMAP_HISTOGRAM, SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_EMBEDDED_DATA, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURE_LARGE_BLOCKS, SPA_FEATURE_LARGE_DNODE, SPA_FEATURE_SHA512, SPA_FEATURE_SKEIN, SPA_FEATURE_EDONR, SPA_FEATURE_USEROBJ_ACCOUNTING, SPA_FEATURE_ENCRYPTION, SPA_FEATURE_PROJECT_QUOTA, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_ALLOCATION_CLASSES, SPA_FEATURE_RESILVER_DEFER, SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_REDACTION_BOOKMARKS, SPA_FEATURE_REDACTED_DATASETS, SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURE_ZSTD_COMPRESS, SPA_FEATURE_DRAID, SPA_FEATURE_ZILSAXATTR, SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, + SPA_FEATURE_LARGE_MICROZAP, SPA_FEATURES } spa_feature_t; #define SPA_FEATURE_DISABLED (-1ULL) typedef enum zfeature_flags { /* Can open pool readonly even if this feature is not supported. */ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), /* * Is this feature necessary to load the pool? i.e. do we need this * feature to read the full feature list out of the MOS? */ ZFEATURE_FLAG_MOS = (1 << 1), /* Activate this feature at the same time it is enabled. */ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), /* Each dataset has a field set if it has ever used this feature. 
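 *
 * Hedged sketch of typical kernel-side use (none of this is part of this
 * change; mark_feature_activation() is a hypothetical helper):
 *
 *	if (spa_feature_is_enabled(spa, feature) &&
 *	    !dsl_dataset_feature_is_active(ds, feature))
 *		mark_feature_activation(ds, feature);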
 */
	ZFEATURE_FLAG_PER_DATASET = (1 << 3)
} zfeature_flags_t;

typedef enum zfeature_type {
	ZFEATURE_TYPE_BOOLEAN,
	ZFEATURE_TYPE_UINT64_ARRAY,
	ZFEATURE_NUM_TYPES
} zfeature_type_t;

typedef struct zfeature_info {
	spa_feature_t fi_feature;
	const char *fi_uname;	/* User-facing feature name */
	const char *fi_guid;	/* On-disk feature identifier */
	const char *fi_desc;	/* Feature description */
	zfeature_flags_t fi_flags;
	boolean_t fi_zfs_mod_supported;	/* supported by running zfs module */
	zfeature_type_t fi_type; /* Only relevant for PER_DATASET features */
	/* array of dependencies, terminated by SPA_FEATURE_NONE */
	const spa_feature_t *fi_depends;
} zfeature_info_t;

typedef int (zfeature_func_t)(zfeature_info_t *, void *);

#define ZFS_FEATURE_DEBUG

_ZFEATURE_COMMON_H zfeature_info_t spa_feature_table[SPA_FEATURES];
_ZFEATURE_COMMON_H boolean_t zfeature_checks_disable;

_ZFEATURE_COMMON_H boolean_t zfeature_is_valid_guid(const char *);
_ZFEATURE_COMMON_H boolean_t zfeature_is_supported(const char *);
_ZFEATURE_COMMON_H int zfeature_lookup_guid(const char *, spa_feature_t *);
_ZFEATURE_COMMON_H int zfeature_lookup_name(const char *, spa_feature_t *);
_ZFEATURE_COMMON_H boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
_ZFEATURE_COMMON_H void zpool_feature_init(void);

#ifdef __cplusplus
}
#endif

#endif /* _ZFEATURE_COMMON_H */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 782192eb8235..1a96460c2b84 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -1,10126 +1,10127 @@
[libzfs.abi XML changes not shown]
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index ee01ee9b218a..b9780720e5a3 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -1,5628 +1,5634 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012 Pawel Jakub Dawidek .
 * All rights reserved
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2016 Igor Kozhukhov
 * Copyright (c) 2018, loli10K . All rights reserved.
 * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2024, Klara, Inc.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #include #include #include static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, const char *, nvlist_t *); static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_estimate; int pa_verbosity; boolean_t pa_astitle; boolean_t pa_progress; uint64_t pa_size; } progress_arg_t; static int dump_record(dmu_replay_record_t *drr, void *payload, size_t payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; const char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. 
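 *
 * Illustration only, e.g. resolving the target of a received stream from
 * its BEGIN record (drrb is assumed to be a struct drr_begin pointer):
 *
 *	const char *snapname;
 *	nvlist_t *nvfs = fsavl_find(avl, drrb->drr_toguid, &snapname);
 *	if (nvfs == NULL)
 *		return (ENOENT);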
*/ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, const char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = fnvpair_value_uint64(snapelem); /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ avl_index_t where = 0; if (avl_find(fsavl, fn, &where) == NULL) avl_insert(fsavl, fn, where); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; nvlist_t *snapholds; /* user holds */ /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t raw; boolean_t doall; boolean_t replicate; boolean_t skipmissing; boolean_t verbose; boolean_t backup; boolean_t seenfrom; boolean_t seento; boolean_t holds; /* were holds requested with send -h */ boolean_t props; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * "snapholds" -> { name (lastname) -> { holdname -> crtime } } * * "origin" -> number (guid) (if clone) * "is_encroot" -> boolean * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); /* * Collect guid, valid props, optionally holds, etc. of a snapshot. * This interface is intended for use as a zfs_iter_snapshots_v2_sorted visitor. 
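 *
 * Illustration (this mirrors the caller in send_iterate_fs() below):
 *
 *	(void) zfs_iter_snapshots_sorted_v2(zhp, 0, send_iterate_snap, sd,
 *	    min_txg, max_txg);
 *
 * Each visit consumes the snapshot handle with zfs_close() and returns 0
 * so that iteration continues.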
*/ static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; boolean_t isfromsnap, istosnap, istosnapwithnofrom; char *snapname; const char *from = sd->fromsnap; const char *to = sd->tosnap; snapname = strrchr(zhp->zfs_name, '@'); assert(snapname != NULL); ++snapname; isfromsnap = (from != NULL && strcmp(from, snapname) == 0); istosnap = (to != NULL && strcmp(to, snapname) == 0); istosnapwithnofrom = (istosnap && from == NULL); if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, to); } zfs_close(zhp); return (0); } fnvlist_add_uint64(sd->parent_snaps, snapname, guid); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. */ if (isfromsnap || (sd->parent_fromsnap_guid == 0 && istosnap)) sd->parent_fromsnap_guid = guid; if (!sd->recursive) { /* * To allow a doall stream to work properly * with a NULL fromsnap */ if (sd->doall && from == NULL && !sd->seenfrom) sd->seenfrom = B_TRUE; if (!sd->seenfrom && isfromsnap) { sd->seenfrom = B_TRUE; zfs_close(zhp); return (0); } if ((sd->seento || !sd->seenfrom) && !istosnapwithnofrom) { zfs_close(zhp); return (0); } if (istosnap) sd->seento = B_TRUE; } nvlist_t *nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(sd->snapprops, snapname, nv); fnvlist_free(nv); if (sd->holds) { nvlist_t *holds; if (lzc_get_holds(zhp->zfs_name, &holds) == 0) { fnvlist_add_nvlist(sd->snapholds, snapname, holds); fnvlist_free(holds); } } zfs_close(zhp); return (0); } /* * Collect all valid props from the handle snap into an nvlist. */ static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) { nvlist_t *props; if (received_only) props = zfs_get_recvd_props(zhp); else props = zhp->zfs_props; nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } nvlist_t *propnv = fnvpair_value_nvlist(elem); boolean_t isspacelimit = (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION); if (isspacelimit && zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; const char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if (strcmp(source, zhp->zfs_name) != 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } else { /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. 
*/ if (!isspacelimit) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { const char *value; value = fnvlist_lookup_string(propnv, ZPROP_VALUE); fnvlist_add_string(nv, propname, value); } else { uint64_t value; value = fnvlist_lookup_uint64(propnv, ZPROP_VALUE); fnvlist_add_uint64(nv, propname, value); } } } /* * returns snapshot guid * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[MAXPATHLEN + 1]; uint64_t guid = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (guid); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID); zfs_close(zhp); } return (guid); } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * Recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs = NULL, *nv = NULL; int rv = 0; uint64_t min_txg = 0, max_txg = 0; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; /* These fields are restored on return from a recursive call. */ uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * On the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - Skip sending the current dataset if it was created later than * the parent tosnap. * - Return error if the current dataset was created earlier than * the parent tosnap, unless --skip-missing specified. Then * just print a warning. */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else if (sd->skipmissing) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: skipping dataset %s and its children:" " snapshot %s does not exist\n"), zhp->zfs_name, sd->tosnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = EZFS_NOENT; } goto out; } nvfs = fnvlist_alloc(); fnvlist_add_string(nvfs, "name", zhp->zfs_name); fnvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid); if (zhp->zfs_dmustats.dds_origin[0] != '\0') { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } fnvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid); zfs_close(origin); } /* Iterate over props. */ if (sd->props || sd->backup || sd->recursive) { nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(nvfs, "props", nv); } if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { boolean_t encroot; /* Determine if this dataset is an encryption root. */ if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { rv = -1; goto out; } if (encroot) fnvlist_add_boolean(nvfs, "is_encroot"); /* * Encrypted datasets can only be sent with properties if * the raw flag is specified because the receive side doesn't * currently have a mechanism for recursively asking the user * for new encryption parameters. */ if (!sd->raw) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s: encrypted dataset %s may not " "be sent with properties without the raw flag\n"), sd->fsname, sd->tosnap, zhp->zfs_name); rv = -1; goto out; } } /* * Iterate over snaps, and set sd->parent_fromsnap_guid. * * If this is a "doall" send, a replicate send or we're just trying * to gather a list of previous snapshots, iterate through all the * snaps in the txg range. Otherwise just look at the one we're * interested in. */ sd->parent_fromsnap_guid = 0; sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) sd->snapholds = fnvlist_alloc(); if (sd->doall || sd->replicate || sd->tosnap == NULL) { if (!sd->replicate && fromsnap_txg != 0) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; (void) zfs_iter_snapshots_sorted_v2(zhp, 0, send_iterate_snap, sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sd->tosnap); if (sd->fromsnap != NULL) sd->seenfrom = B_TRUE; snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) (void) send_iterate_snap(snap, sd); } fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); fnvlist_free(sd->parent_snaps); fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); fnvlist_free(sd->snapprops); if (sd->holds) { fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); fnvlist_free(sd->snapholds); } /* Do not allow the size of the properties list to exceed the limit */ if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "warning: cannot send %s@%s: the size of the list of " "snapshots and properties is too large to be received " "successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name, sd->tosnap); rv = EZFS_NOSPC; goto out; } /* Add this fs to nvlist. */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); fnvlist_add_nvlist(sd->fss, guidstring, nvfs); /* Iterate over children. */ if (sd->recursive) rv = zfs_iter_filesystems_v2(zhp, 0, send_iterate_fs, sd); out: /* Restore saved fields. 
*/ sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; fnvlist_free(nv); fnvlist_free(nvfs); zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, boolean_t replicate, boolean_t skipmissing, boolean_t verbose, boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); sd.fss = fnvlist_alloc(); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.raw = raw; sd.doall = doall; sd.replicate = replicate; sd.skipmissing = skipmissing; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { fnvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { fnvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; boolean_t progressastitle; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; int verbosity; uint64_t size; } send_dump_data_t; static int zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t *spacep) { assert(snapname != NULL); int error = lzc_send_space(snapname, from, flags, spacep); if (error == 0) return (0); char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), snapname); libzfs_handle_t *hdl = zhp->zfs_hdl; switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", zfs_strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, error, errbuf)); } } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. 
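 *
 * Illustration (taken from the caller dump_snapshot() below): the flags
 * word is assembled from LZC_SEND_FLAG_* bits and the stream is emitted
 * with
 *
 *	err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
 *	    fromorigin, sdd->outfd, flags, sdd->debugnv);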
*/ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; if (debugnv != NULL) { thisdbg = fnvlist_alloc(); if (fromsnap != NULL && fromsnap[0] != '\0') fnvlist_add_string(thisdbg, "fromsnap", fromsnap); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[ERRBUFLEN]; int error = errno; (void) snprintf(errbuf, sizeof (errbuf), "%s '%s'", dgettext(TEXT_DOMAIN, "warning: cannot send"), zhp->zfs_name); if (debugnv != NULL) { fnvlist_add_uint64(thisdbg, "error", error); fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", zfs_strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv != NULL) { fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. 
*/ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } int zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, uint64_t *blocks_visited) { zfs_cmd_t zc = {"\0"}; if (bytes_written != NULL) *bytes_written = 0; if (blocks_visited != NULL) *blocks_visited = 0; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = fd; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return (errno); if (bytes_written != NULL) *bytes_written = zc.zc_cookie; if (blocks_visited != NULL) *blocks_visited = zc.zc_objset_type; return (0); } static volatile boolean_t send_progress_thread_signal_duetotimer; static void send_progress_thread_act(int sig, siginfo_t *info, void *ucontext) { (void) sig, (void) ucontext; send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER; } struct timer_desirability { timer_t timer; boolean_t desired; }; static void timer_delete_cleanup(void *timer) { struct timer_desirability *td = timer; if (td->desired) timer_delete(td->timer); } #ifdef SIGINFO #define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO) #else #define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO #endif #define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \ sigset_t new; \ sigemptyset(&new); \ sigaddset(&new, SIGUSR1); \ SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \ pthread_sigmask(SIG_BLOCK, &new, old); \ } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_handle_t *zhp = pa->pa_zhp; uint64_t bytes; uint64_t blocks; uint64_t total = pa->pa_size / 100; char buf[16]; time_t t; struct tm tm; int err; const struct sigaction signal_action = {.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO}; struct sigevent timer_cfg = {.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1}; const struct itimerspec timer_time = {.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}}; struct timer_desirability timer = {}; sigaction(SIGUSR1, &signal_action, NULL); #ifdef SIGINFO sigaction(SIGINFO, &signal_action, NULL); #endif if ((timer.desired = pa->pa_progress || pa->pa_astitle)) { if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer)) return ((void *)(uintptr_t)errno); (void) timer_settime(timer.timer, 0, &timer_time, NULL); } pthread_cleanup_push(timer_delete_cleanup, &timer); if (!pa->pa_parsable && pa->pa_progress) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? "BYTES" : " SENT", pa->pa_verbosity >= 2 ? " BLOCKS " : "", zhp->zfs_name); } /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { pause(); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) err = 0; pthread_exit(((void *)(uintptr_t)err)); } (void) time(&t); localtime_r(&t, &tm); if (pa->pa_astitle) { char buf_bytes[16]; char buf_size[16]; int pct; zfs_nicenum(bytes, buf_bytes, sizeof (buf_bytes)); zfs_nicenum(pa->pa_size, buf_size, sizeof (buf_size)); pct = (total > 0) ? 
bytes / total : 100; zfs_setproctitle("sending %s (%d%%: %s/%s)", zhp->zfs_name, MIN(pct, 100), buf_bytes, buf_size); } if (pa->pa_verbosity >= 2 && pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_verbosity >= 2) { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %8llu %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); } else if (pa->pa_progress || !send_progress_thread_signal_duetotimer) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } pthread_cleanup_pop(B_TRUE); return (NULL); } static boolean_t send_progress_thread_exit( libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); pthread_sigmask(SIG_SETMASK, oldmask, NULL); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, dgettext(TEXT_DOMAIN, "progress thread exited nonzero"))); else return (B_FALSE); } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "incremental\t%s\t%s"), fromsnap, tosnap); } else { /* * Workaround for GCC 12+ with UBSan enabled deficencies. * * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Wformat-overflow. */ #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-overflow" #endif (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full\t%s"), tosnap); #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic pop #endif } (void) fprintf(fout, "\t%llu", (longlong_t)size); } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } if (size != 0) { char buf[16]; zfs_nicebytes(size, buf, sizeof (buf)); /* * Workaround for GCC 12+ with UBSan enabled deficencies. * * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Wformat-overflow. */ #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-overflow" #endif (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic pop #endif } } (void) fprintf(fout, "\n"); } /* * Send a single filesystem snapshot, updating the send dump data. * This interface is intended for use as a zfs_iter_snapshots_v2_sorted visitor. 
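 *
 * Illustration (see dump_filesystem() below): a doall or replicate send
 * drives this visitor over the whole txg range with
 *
 *	rv = zfs_iter_snapshots_sorted_v2(zhp, 0, dump_snapshot, sdd,
 *	    min_txg, max_txg);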
*/ static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (sdd->raw) flags |= LZC_SEND_FLAG_RAW; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { const char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); if (nvfs != NULL) { snapprops = fnvlist_lookup_nvlist(nvfs, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, thissnap); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; if (sdd->prevsnap[0] != '\0') { (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); *(strchr(fromds, '@') + 1) = '\0'; (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); } if (zfs_send_space(zhp, zhp->zfs_name, sdd->prevsnap[0] ? fromds : NULL, flags, &size) == 0) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); sdd->size += size; } } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ sigset_t oldmask; { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = sdd->verbosity; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; pa.pa_progress = sdd->progress; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask)) return (-1); } (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } /* * Send all snapshots for a filesystem, updating the send dump data. */ static int dump_filesystem(zfs_handle_t *zhp, send_dump_data_t *sdd) { int rv = 0; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = {"\0"}; uint64_t min_txg = 0, max_txg = 0; /* * Make sure the tosnap exists. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ if (sdd->replicate && sdd->fromsnap) { /* * Make sure the fromsnap exists. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) missingfrom = B_TRUE; } sdd->seenfrom = sdd->seento = B_FALSE; sdd->prevsnap[0] = '\0'; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; /* * Iterate through all snapshots and process the ones we will be * sending. If we only have a "from" and "to" snapshot to deal * with, we can avoid iterating through all the other snapshots. */ if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { if (!sdd->replicate) { if (sdd->fromsnap != NULL) { min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->fromsnap); } if (sdd->tosnap != NULL) { max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->tosnap); } } rv = zfs_iter_snapshots_sorted_v2(zhp, 0, dump_snapshot, sdd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; /* Dump fromsnap. */ if (!sdd->seenfrom) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->fromsnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = errno; } /* Dump tosnap. 
*/ if (rv == 0) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->tosnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = errno; } } if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } /* * Send all snapshots for all filesystems in sdd. */ static int dump_filesystems(zfs_handle_t *rzhp, send_dump_data_t *sdd) { nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; nvfs = fnvpair_value_nvlist(fspair); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { const char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; snapprops = fnvlist_lookup_nvlist(origin_nv, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, snapname); fnvlist_add_boolean(snapprops, "is_clone_origin"); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; const char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; fslist = fnvpair_value_nvlist(fspair); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; fsname = fnvlist_lookup_string(fslist, "name"); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* Parent has not been sent; skip this one. */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * Origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); fnvlist_add_boolean(fslist, "sent"); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* Clean out the sent flags in case we reuse this fss. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; fslist = fnvpair_value_nvlist(fspair); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread, i; unsigned long long checksum, packed_len; /* * Decode token header, which is: * <token version>-<checksum of payload>-<uncompressed payload length> * Note that the only supported token version is 1.
*/ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* Convert hexadecimal representation to binary. */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* Verify checksum. */ zio_cksum_t cksum; fletcher_4_native_varsize(compressed, len, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* Uncompress. */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* Unpack nvlist. */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } static enum lzc_send_flags lzc_flags_from_sendflags(const sendflags_t *flags) { enum lzc_send_flags lzc_flags = 0; if (flags->largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, const char *redactbook, char *errbuf, uint64_t *sizep) { uint64_t size; FILE *fout = flags->dryrun ? 
stdout : stderr; progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; sigset_t oldmask; { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_TRUE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", zfs_strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, redactbook, fd, &size); *sizep = size; if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask)) return (-1); if (!flags->progress && !flags->parsable) return (err); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", zfs_strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } send_print_verbose(fout, zhp->zfs_name, from, size, flags->parsable); if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); } else { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } return (0); } static boolean_t redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } static boolean_t redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, const uint64_t *snaps2, uint64_t num_snaps2) { if (num_snaps1 != num_snaps2) return (B_FALSE); for (int i = 0; i < num_snaps1; i++) { if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) return (B_FALSE); } return (B_TRUE); } static int get_bookmarks(const char *path, nvlist_t **bmarksp) { nvlist_t *props = fnvlist_alloc(); int error; fnvlist_add_boolean(props, "redact_complete"); fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); error = lzc_get_bookmarks(path, props, bmarksp); fnvlist_free(props); return (error); } static nvpair_t * find_redact_pair(nvlist_t *bmarks, const uint64_t *redact_snap_guids, int num_redact_snaps) { nvpair_t *pair; for (pair = nvlist_next_nvpair(bmarks, NULL); pair; pair = nvlist_next_nvpair(bmarks, pair)) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); uint_t len = 0; uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, ZPROP_VALUE, &len); if (redact_snaps_equal(redact_snap_guids, num_redact_snaps, bmarksnaps, len)) { break; } } return (pair); } static boolean_t get_redact_complete(nvpair_t *pair) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); boolean_t complete = fnvlist_lookup_boolean_value(vallist, ZPROP_VALUE); return (complete); } /* * Check that the list of redaction snapshots in the bookmark matches the send * we're resuming, and return whether or not it's complete. * * Note that the caller needs to free the contents of *bookname with free() if * this function returns successfully. 
*/ static int find_redact_book(libzfs_handle_t *hdl, const char *path, const uint64_t *redact_snap_guids, int num_redact_snaps, char **bookname) { char errbuf[ERRBUFLEN]; nvlist_t *bmarks; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); int error = get_bookmarks(path, &bmarks); if (error != 0) { if (error == ESRCH) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nonexistent redaction bookmark provided")); } else if (error == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset to be sent no longer exists")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unknown error: %s"), zfs_strerror(error)); } return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } nvpair_t *pair = find_redact_pair(bmarks, redact_snap_guids, num_redact_snaps); if (pair == NULL) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no appropriate redaction bookmark exists")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } boolean_t complete = get_redact_complete(pair); if (!complete) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incomplete redaction bookmark provided")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } *bookname = strndup(nvpair_name(pair), ZFS_MAX_DATASET_NAME_LEN); ASSERT3P(*bookname, !=, NULL); fnvlist_free(bmarks); return (0); } static enum lzc_send_flags lzc_flags_from_resume_nvl(nvlist_t *resume_nvl) { enum lzc_send_flags lzc_flags = 0; if (nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (nvlist_exists(resume_nvl, "rawok")) lzc_flags |= LZC_SEND_FLAG_RAW; if (nvlist_exists(resume_nvl, "savedok")) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { char errbuf[ERRBUFLEN]; const char *toname; const char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? 
stdout : stderr; uint64_t *redact_snap_guids = NULL; int num_redact_snaps = 0; char *redact_book = NULL; uint64_t size = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->saved) { (void) strlcpy(name, toname, sizeof (name)); } else { error = guid_to_name(hdl, toname, toguid, B_FALSE, name); if (error != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot " "used in the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no " "longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { num_redact_snaps = -1; } if (fromguid != 0) { if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } redact_snap_guids = NULL; if (nvlist_lookup_uint64_array(resume_nvl, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, (uint_t *)&num_redact_snaps) == 0) { char path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(path, toname, sizeof (path)); char *at = strchr(path, '@'); ASSERT3P(at, !=, NULL); *at = '\0'; if ((error = find_redact_book(hdl, path, redact_snap_guids, num_redact_snaps, &redact_book)) != 0) { return (error); } } enum lzc_send_flags lzc_flags = lzc_flags_from_sendflags(flags) | lzc_flags_from_resume_nvl(resume_nvl); if (flags->verbosity != 0 || flags->progressastitle) { /* * Some of these may have come from the resume token, set them * here for size estimate purposes. */ sendflags_t tmpflags = *flags; if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) tmpflags.largeblock = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_COMPRESS) tmpflags.compress = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) tmpflags.embed_data = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_RAW) tmpflags.raw = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_SAVED) tmpflags.saved = B_TRUE; error = estimate_size(zhp, fromname, outfd, &tmpflags, resumeobj, resumeoff, bytes, redact_book, errbuf, &size); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; sigset_t oldmask; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; pa.pa_progress = flags->progress; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { if (redact_book != NULL) free(redact_book); zfs_close(zhp); return (error); } SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff, redact_book); if (redact_book != NULL) free(redact_book); if (send_progress_thread_exit(hdl, tid, &oldmask)) { zfs_close(zhp); return (-1); } char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source could not be found")); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, "%s", zfs_strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } else { if (redact_book != NULL) free(redact_book); } zfs_close(zhp); return (error); } struct zfs_send_resume_impl { libzfs_handle_t *hdl; sendflags_t *flags; nvlist_t *resume_nvl; }; static int zfs_send_resume_impl_cb(int outfd, void *arg) { struct zfs_send_resume_impl *zsri = arg; return (zfs_send_resume_impl_cb_impl(zsri->hdl, zsri->flags, outfd, zsri->resume_nvl)); } static int zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { struct zfs_send_resume_impl zsri = { .hdl = hdl, .flags = flags, .resume_nvl = resume_nvl, }; return (lzc_send_wrapper(zfs_send_resume_impl_cb, outfd, &zsri)); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { int ret; char errbuf[ERRBUFLEN]; nvlist_t *resume_nvl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); fnvlist_free(resume_nvl); return (ret); } int zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, const char *resume_token) { int ret; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; uint64_t saved_guid = 0, resume_guid = 0; uint64_t obj = 0, off = 0, bytes = 0; char token_buf[ZFS_MAXPROPLEN]; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "saved send failed")); ret = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (ret != 0) goto out; saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); if (saved_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } /* * If a resume token is provided we use the object and offset * from that 
instead of the default, which starts from the * beginning. */ if (resume_token != NULL) { resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &resume_guid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(saved_nvl, "toguid", &saved_guid)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset's resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (resume_guid != saved_guid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token does not match dataset")); ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); goto out; } } (void) nvlist_remove_all(saved_nvl, "object"); fnvlist_add_uint64(saved_nvl, "object", obj); (void) nvlist_remove_all(saved_nvl, "offset"); fnvlist_add_uint64(saved_nvl, "offset", off); (void) nvlist_remove_all(saved_nvl, "bytes"); fnvlist_add_uint64(saved_nvl, "bytes", bytes); (void) nvlist_remove_all(saved_nvl, "toname"); fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); out: fnvlist_free(saved_nvl); fnvlist_free(resume_nvl); return (ret); } /* * This function informs the target system that the recursive send is complete. * The record is also expected in the case of a send -p. */ static int send_conclusion_record(int fd, zio_cksum_t *zc) { dmu_replay_record_t drr; memset(&drr, 0, sizeof (dmu_replay_record_t)); drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; if (write(fd, &drr, sizeof (drr)) == -1) { return (errno); } return (0); } /* * This function is responsible for sending the records that contain the * necessary information for the target system's libzfs to be able to set the * properties of the filesystem being received, or to be able to prepare for * a recursive receive. * * The "zhp" argument is the handle of the snapshot we are sending * (the "tosnap"). The "from" argument is the short snapshot name (the part * after the @) of the incremental source. 
*/ static int send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, boolean_t gather_props, boolean_t recursive, boolean_t verbose, boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t skipmissing, boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, nvlist_t **fssp, avl_tree_t **fsavlp) { int err = 0; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { {0} }; int featureflags = 0; /* name of filesystem/volume that contains snapshot we are sending */ char tofs[ZFS_MAX_DATASET_NAME_LEN]; /* short name of snap we are sending */ const char *tosnap = ""; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } if (holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); char *at = strchr(tofs, '@'); if (at != NULL) { *at = '\0'; tosnap = at + 1; } if (gather_props) { nvlist_t *hdrnv = fnvlist_alloc(); nvlist_t *fss = NULL; if (from != NULL) fnvlist_add_string(hdrnv, "fromsnap", from); fnvlist_add_string(hdrnv, "tosnap", tosnap); if (!recursive) fnvlist_add_boolean(hdrnv, "not_recursive"); if (raw) { fnvlist_add_boolean(hdrnv, "raw"); } if (gather_nvlist(zhp->zfs_hdl, tofs, from, tosnap, recursive, raw, doall, replicate, skipmissing, verbose, backup, holds, props, &fss, fsavlp) != 0) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } /* * Do not allow the size of the properties list to exceed * the limit */ if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " "the size of the list of snapshots and properties " "is too large to be received successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name); return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, errbuf)); } fnvlist_add_nvlist(hdrnv, "fss", fss); VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0)); if (fssp != NULL) { *fssp = fss; } else { fnvlist_free(fss); } fnvlist_free(hdrnv); } if (!dryrun) { dmu_replay_record_t drr; memset(&drr, 0, sizeof (dmu_replay_record_t)); /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); if (snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, fd); free(packbuf); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", zfs_strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } err = send_conclusion_record(fd, &zc); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", zfs_strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } } return (0); } /* * Generate a send stream. The "zhp" argument is the filesystem/volume * that contains the snapshot to send. The "fromsnap" argument is the * short name (the part after the '@') of the snapshot that is the * incremental source to send from (if non-NULL). The "tosnap" argument * is the short name of the snapshot to send. 
* * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. * * Pre-wrapped (cf. lzc_send_wrapper()). */ static int zfs_send_cb_impl(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[ERRBUFLEN]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (fromsnap) { char full_fromsnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_fromsnap_name, sizeof (full_fromsnap_name), "%s@%s", zhp->zfs_name, fromsnap) >= sizeof (full_fromsnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *fromsnapn = zfs_open(zhp->zfs_hdl, full_fromsnap_name, ZFS_TYPE_SNAPSHOT); if (fromsnapn == NULL) { err = -1; goto err_out; } zfs_close(fromsnapn); } if (flags->replicate || flags->doall || flags->props || flags->holds || flags->backup) { char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), "%s@%s", zhp->zfs_name, tosnap) >= sizeof (full_tosnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, full_tosnap_name, ZFS_TYPE_SNAPSHOT); if (tosnap == NULL) { err = -1; goto err_out; } err = send_prelim_records(tosnap, fromsnap, outfd, flags->replicate || flags->props || flags->holds, flags->replicate, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->skipmissing, flags->backup, flags->holds, flags->props, flags->doall, &fss, &fsavl); zfs_close(tosnap); if (err != 0) goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.progressastitle = flags->progressastitle; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.raw = flags->raw; sdd.holds = flags->holds; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. 
*/ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicebytes(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. */ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); fnvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props || flags->backup || flags->holds)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. 
*/ int err2 = send_conclusion_record(outfd, NULL); if (err2 != 0) return (zfs_standard_error(zhp->zfs_hdl, err2, errbuf)); } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); fnvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); return (err); } struct zfs_send { zfs_handle_t *zhp; const char *fromsnap; const char *tosnap; sendflags_t *flags; snapfilter_cb_t *filter_func; void *cb_arg; nvlist_t **debugnvp; }; static int zfs_send_cb(int outfd, void *arg) { struct zfs_send *zs = arg; return (zfs_send_cb_impl(zs->zhp, zs->fromsnap, zs->tosnap, zs->flags, outfd, zs->filter_func, zs->cb_arg, zs->debugnvp)); } int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { struct zfs_send arg = { .zhp = zhp, .fromsnap = fromsnap, .tosnap = tosnap, .flags = flags, .filter_func = filter_func, .cb_arg = cb_arg, .debugnvp = debugnvp, }; return (lzc_send_wrapper(zfs_send_cb, outfd, &arg)); } static zfs_handle_t * name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { char dirname[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(dirname, '@'); if (c != NULL) *c = '\0'; return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); } /* * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either * an earlier snapshot in the same filesystem, or a snapshot before later's * origin, or it's origin's origin, etc. */ static boolean_t snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) { boolean_t ret; uint64_t later_txg = (later->zfs_type == ZFS_TYPE_FILESYSTEM || later->zfs_type == ZFS_TYPE_VOLUME ? UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); if (earlier_txg >= later_txg) return (B_FALSE); zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, earlier->zfs_name); zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, later->zfs_name); if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_TRUE); } char clonename[ZFS_MAX_DATASET_NAME_LEN]; if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_FALSE); } zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, ZFS_TYPE_DATASET); uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); /* * If "earlier" is exactly the origin, then * snapshot_is_before(earlier, origin) will return false (because * they're the same). */ if (origin_txg == earlier_txg && strcmp(origin->zfs_name, earlier->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); zfs_close(origin); return (B_TRUE); } zfs_close(earlier_dir); zfs_close(later_dir); ret = snapshot_is_before(earlier, origin); zfs_close(origin); return (ret); } /* * The "zhp" argument is the handle of the dataset to send (typically a * snapshot). The "from" argument is the full name of the snapshot or * bookmark that is the incremental source. * * Pre-wrapped (cf. lzc_send_wrapper()). 
*/ static int zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; pthread_t ptid; progress_arg_t pa = { 0 }; uint64_t size = 0; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), name); if (from != NULL && strchr(from, '@')) { zfs_handle_t *from_zhp = zfs_open(hdl, from, ZFS_TYPE_DATASET); if (from_zhp == NULL) return (-1); if (!snapshot_is_before(from_zhp, zhp)) { zfs_close(from_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } zfs_close(from_zhp); } if (redactbook != NULL) { char bookname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *redact_snaps; zfs_handle_t *book_zhp; char *at, *pound; int dsnamelen; pound = strchr(redactbook, '#'); if (pound != NULL) redactbook = pound + 1; at = strchr(name, '@'); if (at == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot do a redacted send to a filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } dsnamelen = at - name; if (snprintf(bookname, sizeof (bookname), "%.*s#%s", dsnamelen, name, redactbook) >= sizeof (bookname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid bookmark name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); if (book_zhp == NULL) return (-1); if (nvlist_lookup_nvlist(book_zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snaps) != 0 || redact_snaps == NULL) { zfs_close(book_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a redaction bookmark")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } zfs_close(book_zhp); } /* * Send fs properties */ if (flags->props || flags->holds || flags->backup) { /* * Note: the header generated by send_prelim_records() * assumes that the incremental source is in the same * filesystem/volume as the target (which is a requirement * when doing "zfs send -R"). But that isn't always the * case here (e.g. send from snap in origin, or send from * bookmark). We pass from=NULL, which will omit this * information from the prelim records; it isn't used * when receiving this type of stream. */ err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, B_FALSE, flags->backup, flags->holds, flags->props, flags->doall, NULL, NULL); if (err != 0) return (err); } /* * Perform size estimate if verbose was specified. */ if (flags->verbosity != 0 || flags->progressastitle) { err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, errbuf, &size); if (err != 0) return (err); } if (flags->dryrun) return (0); /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ sigset_t oldmask; { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; pa.pa_progress = flags->progress; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", zfs_strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask); } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); if (send_progress_thread_exit(hdl, ptid, &oldmask)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { /* Write the final end record. */ err = send_conclusion_record(fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFAULT: case EFBIG: case EINVAL: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EROFS: zfs_error_aux(hdl, "%s", zfs_strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); - + case ZFS_ERR_STREAM_LARGE_MICROZAP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "source snapshot contains large microzaps, " + "need -L (--large-block) or -w (--raw) to " + "generate stream")); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } struct zfs_send_one { zfs_handle_t *zhp; const char *from; sendflags_t *flags; const char *redactbook; }; static int zfs_send_one_cb(int fd, void *arg) { struct zfs_send_one *zso = arg; return (zfs_send_one_cb_impl(zso->zhp, zso->from, fd, zso->flags, zso->redactbook)); } int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { struct zfs_send_one zso = { .zhp = zhp, .from = from, .flags = flags, .redactbook = redactbook, }; return (lzc_send_wrapper(zfs_send_one_cb, fd, &zso)); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) fletcher_4_incremental_byteswap(buf, ilen, zc); else fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (len > hdl->libzfs_max_nvlist) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); free(buf); return (ENOMEM); } err = recv_read(hdl, fd, buf, len, 
byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } /* * Returns the grand origin (origin of origin of origin...) of a given handle. * If this dataset is not a clone, it simply returns a copy of the original * handle. */ static zfs_handle_t * recv_open_grand_origin(zfs_handle_t *zhp) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *ozhp = zfs_handle_dup(zhp); while (ozhp != NULL) { if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) break; (void) zfs_close(ozhp); ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); } return (ozhp); } static int recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) { int err; zfs_handle_t *ozhp = NULL; /* * Attempt to rename the dataset. If it fails with EACCES we have * attempted to rename the dataset outside of its encryption root. * Force the dataset to become an encryption root and try again. */ err = lzc_rename(name, newname); if (err == EACCES) { ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = ENOENT; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = lzc_rename(name, newname); } out: if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp = NULL; zfs_handle_t *zhp = NULL; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (clp == NULL) { err = -1; goto out; } err = changelist_prefix(clp); if (err) goto out; if (tryname) { (void) strlcpy(newname, tryname, ZFS_MAX_DATASET_NAME_LEN); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); out: if (clp != NULL) changelist_free(clp); if (zhp != NULL) zfs_close(zhp); return (err); } static int recv_promote(libzfs_handle_t *hdl, const char *fsname, const char *origin_fsname, recvflags_t *flags) { int err; zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL, *ozhp = NULL; if (flags->verbose) (void) printf("promoting %s\n", fsname); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); /* * Attempt to promote the dataset. If it fails with EACCES the * promotion would cause this dataset to leave its encryption root. * Force the origin to become an encryption root and try again. 
*/ err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (err == EACCES) { zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = -1; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); } out: if (zhp != NULL) zfs_close(zhp); if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); zfs_type_t type = zfs_get_type(zhp); if (type == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. */ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; uint64_t *redact_snap_guids; uint64_t num_redact_snaps; } guid_to_name_data_t; static boolean_t redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) { uint64_t *bmark_snaps; uint_t bmark_num_snaps; nvlist_t *nvl; if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) return (B_FALSE); nvl = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, &bmark_num_snaps); if (bmark_num_snaps != gtnd->num_redact_snaps) return (B_FALSE); int i = 0; for (; i < bmark_num_snaps; i++) { int j = 0; for (; j < bmark_num_snaps; j++) { if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) break; } if (j == bmark_num_snaps) break; } return (i == bmark_num_snaps); } static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children_v2(zhp, 0, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks_v2(zhp, 0, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. 
This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. * * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or * -1, then redact_snap_guids will be an array of the guids of the snapshots the * redaction bookmark was created with. If num_redact_snaps is -1, then we will * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the * given guid. Note that a redaction bookmark can be returned if * num_redact_snaps == -1. */ static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; gtnd.redact_snap_guids = redact_snap_guids; gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd); if (err != EEXIST) err = zfs_iter_children_v2(zhp, 0, guid_to_name_cb, &gtnd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks_v2(zhp, 0, guid_to_name_cb, &gtnd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, -1, name)); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2.
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; const char *fsname = NULL, *snapname = NULL; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } /* * This function reestablishes the hierarchy of encryption roots after a * recursive incremental receive has completed. This must be done after the * second call to recv_incremental_replication() has renamed and promoted all * sent datasets to their final locations in the dataset hierarchy. */ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; nvlist_t *snaps, *props, *stream_nvfs = NULL; nvpair_t *snapel = NULL; boolean_t is_encroot, is_clone, stream_encroot; char *cp; const char *stream_keylocation = NULL; char keylocation[MAXNAMELEN]; char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; stream_nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); props = fnvlist_lookup_nvlist(stream_nvfs, "props"); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ err = ENOENT; while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; guid = fnvpair_value_uint64(snapel); err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; } if (err != 0) continue; cp = strchr(fsname, '@'); if (cp != NULL) *cp = '\0'; zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, * force it to become one. Fixup the keylocation if necessary. */ if (stream_encroot) { if (!is_clone && !is_encroot) { err = lzc_change_key(fsname, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } stream_keylocation = fnvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* * Refresh the properties in case the call to * lzc_change_key() changed the value. 
*/ zfs_refresh_properties(zhp); err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, keylocation, sizeof (keylocation), NULL, NULL, 0, B_TRUE); if (err != 0) { zfs_close(zhp); goto error; } if (strcmp(keylocation, stream_keylocation) != 0) { err = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), stream_keylocation); if (err != 0) { zfs_close(zhp); goto error; } } } /* * If the dataset is not flagged as an encryption root and is * currently an encryption root, force it to inherit from its * parent. The root of a raw send should never be * force-inherited. */ if (!stream_encroot && is_encroot && strcmp(top_zfs, fsname) != 0) { err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } zfs_close(zhp); } return (0); error: return (err); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; const char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; const char *s1, *s2; fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; deleted = fnvlist_alloc(); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; const char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); fsname = fnvlist_lookup_string(nvfs, "name"); parent_fromsnap_guid = fnvlist_lookup_uint64(nvfs, "parentfromsnap"); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; thisguid = fnvpair_value_uint64(snapelem); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! */ nvlist_t *origin_nvfs; const char *origin_fsname; origin_nvfs = fsavl_find(local_avl, originguid, NULL); origin_fsname = fnvlist_lookup_string( origin_nvfs, "name"); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); fnvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. 
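* Setting needagain only requests a retry; the check at the bottom of this function re-runs the whole pass only if some rename, destroy or promote also made progress this time through.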
*/ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; const char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); thisguid = fnvpair_value_uint64(snapelem); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = {"\0"}; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); zcmd_write_src_nvlist(hdl, &zc, props); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } stream_fsname = fnvlist_lookup_string(stream_nvfs, "name"); stream_parent_fromsnap_guid = fnvlist_lookup_uint64( stream_nvfs, "parentfromsnap"); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. */ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. 
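* For example (hypothetical names): receiving a replication stream from pool/data into tank/backup does not rename tank/backup itself, but a child whose last path component changed on the sending side since the fromsnap is still renamed to match.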
*/ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { const char *pname; pname = fnvlist_lookup_string(parent, "name"); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { fnvlist_add_boolean(renamed, newname); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); fnvlist_free(local_nv); fnvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain || error != 0); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; const char *fromsnap = NULL; const char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[ERRBUFLEN]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive, raw; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. 
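* For a compound stream the DRR_END record carries the checksum of the DRR_BEGIN record and its nvlist payload; it is checked against the running checksum (*zc) accumulated while those were read.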
*/ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { renamed = fnvlist_alloc(); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } fnvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. 
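* The first pass may have had to leave temporary names in place because a new parent did not exist yet; now that the sent filesystems have been received, those renames can complete.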
*/ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } if (raw && softerr == 0 && *top_zfs != NULL) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv); } out: fsavl_destroy(stream_avl); fnvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); uint64_t payload_size; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); drr->drr_u.drr_object.drr_raw_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_raw_bonuslen); } payload_size = DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); drr->drr_u.drr_spill.drr_compressed_size = BSWAP_64(drr->drr_u.drr_spill. drr_compressed_size); } payload_size = DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); free(buf); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? 
"checksum mismatch" : "incomplete stream"))); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Prepare a new nvlist of properties that are to override (-o) or be excluded * (-x) from the received dataset * recvprops: received properties from the send stream * cmdprops: raw input properties from command line * origprops: properties, both locally-set and received, currently set on the * target dataset if it exists, NULL otherwise. * oxprops: valid output override (-o) and excluded (-x) properties */ static int zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs, boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops, nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out, uint_t *wkeylen_out, const char *errbuf) { nvpair_t *nvp; nvlist_t *oprops, *voprops; zfs_handle_t *zhp = NULL; zpool_handle_t *zpool_hdl = NULL; char *cp; int ret = 0; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; if (nvlist_empty(cmdprops)) return (0); /* No properties to override or exclude */ *oxprops = fnvlist_alloc(); oprops = fnvlist_alloc(); strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN); /* * Get our dataset handle. The target dataset may not exist yet. */ if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) { zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET); if (zhp == NULL) { ret = -1; goto error; } } /* open the zpool handle */ cp = strchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; zpool_hdl = zpool_open(hdl, namebuf); if (zpool_hdl == NULL) { ret = -1; goto error; } /* restore namebuf to match fsname for later use */ if (cp != NULL) *cp = '/'; /* * first iteration: process excluded (-x) properties now and gather * added (-o) properties to be later processed by zfs_valid_proplist() */ nvp = NULL; while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); /* * It turns out, if we don't normalize "aliased" names * e.g. compress= against the "real" names (e.g. compression) * here, then setting/excluding them does not work as * intended. * * But since user-defined properties wouldn't have a valid * mapping here, we do this conditional dance. 
*/ const char *newname = name; if (prop >= ZFS_PROP_TYPE) newname = zfs_prop_to_name(prop); /* "origin" is processed separately, don't handle it here */ if (prop == ZFS_PROP_ORIGIN) continue; /* raw streams can't override encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set or excluded for raw streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * For plain replicated send, we can ignore encryption * properties other than first stream */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && !newfs && recursive && !raw) { continue; } /* incremental streams can only exclude encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && !newfs && nvpair_type(nvp) != DATA_TYPE_BOOLEAN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set for incremental streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } switch (nvpair_type(nvp)) { case DATA_TYPE_BOOLEAN: /* -x property */ /* * DATA_TYPE_BOOLEAN is the way we're asked to "exclude" * a property: this is done by forcing an explicit * inherit on the destination so the effective value is * not the one we received from the send stream. */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: %s: property '%s' does not " "apply to datasets of this type\n"), fsname, name); continue; } /* * We do this only if the property is not already * locally-set, in which case its value will take * priority over the received anyway. */ if (nvlist_exists(origprops, newname)) { nvlist_t *attrs; const char *source = NULL; attrs = fnvlist_lookup_nvlist(origprops, newname); if (nvlist_lookup_string(attrs, ZPROP_SOURCE, &source) == 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } /* * We can't force an explicit inherit on non-inheritable * properties: if we're asked to exclude this kind of * values we remove them from "recvprops" input nvlist. */ if (!zfs_prop_user(name) && /* can be inherited too */ !zfs_prop_inheritable(prop) && nvlist_exists(recvprops, newname)) fnvlist_remove(recvprops, newname); else fnvlist_add_boolean(*oxprops, newname); break; case DATA_TYPE_STRING: /* -o property=value */ /* * we're trying to override a property that does not * make sense for this type of dataset, but we don't * want to fail if the receive is recursive: this comes * in handy when the send stream contains, for * instance, a child ZVOL and we're trying to receive * it with "-o atime=on" */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { if (recursive) continue; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' does not apply to datasets " "of this type"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } fnvlist_add_string(oprops, newname, fnvpair_value_string(nvp)); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be a string or boolean"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (toplevel) { /* convert override strings properties to native */ if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET, oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) { ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * zfs_crypto_create() requires the parent name. 
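* (For a hypothetical fsname of "tank/backup/data", the parent used here is "tank/backup".)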
Get it * by truncating the fsname copy stored in namebuf. */ cp = strrchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; if (!raw && !(!newfs && recursive) && zfs_crypto_create(hdl, namebuf, voprops, NULL, B_FALSE, wkeydata_out, wkeylen_out) != 0) { fnvlist_free(voprops); ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto error; } /* second pass: process "-o" properties */ fnvlist_merge(*oxprops, voprops); fnvlist_free(voprops); } else { /* override props on child dataset are inherited */ nvp = NULL; while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); fnvlist_add_boolean(*oxprops, name); } } error: if (zhp != NULL) zfs_close(zhp); if (zpool_hdl != NULL) zpool_close(zpool_hdl); fnvlist_free(oprops); return (ret); } /* * Restores a backup of tosnap from the file descriptor specified by infd. */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { struct timespec begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[ERRBUFLEN]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs, stream_resumingnewfs; boolean_t newprops = B_FALSE; uint64_t read_bytes = 0; uint64_t errflags = 0; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; nvlist_t *snapholds_nvlist = NULL; zprop_errflags_t prop_errflags; nvlist_t *prop_errors = NULL; boolean_t recursive; const char *snapname = NULL; char destsnap[MAXPATHLEN * 2]; char origin[MAXNAMELEN] = {0}; char name[MAXPATHLEN]; char tmp_keylocation[MAXNAMELEN] = {0}; nvlist_t *rcvprops = NULL; /* props received from the send stream */ nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */ nvlist_t *origprops = NULL; /* original props (if destination exists) */ zfs_type_t type = ZFS_TYPE_INVALID; boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; #ifndef CLOCK_MONOTONIC_RAW #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC #endif clock_gettime(CLOCK_MONOTONIC_RAW, &begin_time); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); /* Did the user request holds be skipped via zfs recv -k? */ boolean_t holds = flags->holds && !flags->skipholds; if (stream_avl != NULL) { const char *keylocation = NULL; nvlist_t *lookup = NULL; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { rcvprops = fnvlist_alloc(); newprops = B_TRUE; } /* * The keylocation property may only be set on encryption roots, * but this dataset might not become an encryption root until * recv_fix_encryption_hierarchy() is called. That function * will fixup the keylocation anyway, so we temporarily unset * the keylocation for now to avoid any errors from the receive * ioctl. 
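* The value saved in tmp_keylocation is added back into rcvprops at the "out:" label below, after the receive ioctl has run.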
*/ err = nvlist_lookup_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (err == 0) { strlcpy(tmp_keylocation, keylocation, MAXNAMELEN); (void) nvlist_remove_all(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); } if (flags->canmountoff) { fnvlist_add_uint64(rcvprops, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0); } else if (newprops) { /* nothing in rcvprops, eliminate it */ fnvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { snapprops_nvlist = fnvlist_lookup_nvlist(lookup, snapname); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { snapholds_nvlist = fnvlist_lookup_nvlist( lookup, snapname); } } } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = umem_alloc(len + 2, UMEM_NOFAIL); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). */ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || strchr(sendfs, '/') == NULL); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot. */ (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); if (cp != NULL) umem_free(cp, strlen(cp) + 1); if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } /* * Determine the name of the origin snapshot. 
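* It comes from an explicit origin supplied by the caller (originsnap) when there is one; otherwise, for clone streams (DRR_FLAG_CLONE), drr_fromguid is resolved to a local snapshot with guid_to_name().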
*/ if (originsnap) { (void) strlcpy(origin, originsnap, sizeof (origin)); if (flags->verbose) (void) printf("using provided clone origin %s\n", origin); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, destsnap, drrb->drr_fromguid, B_FALSE, origin) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), destsnap); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } if (flags->verbose) (void) printf("found clone origin %s\n", origin); } if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_DEDUP)) { (void) fprintf(stderr, gettext("ERROR: \"zfs receive\" no longer supports " "deduplicated send streams. Use\n" "the \"zstream redup\" command to convert this stream " "to a regular,\n" "non-deduplicated stream.\n")); err = zfs_error(hdl, EZFS_NOTSUP, errbuf); goto out; } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW; boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; stream_resumingnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strlcpy(name, destsnap, sizeof (name)); cp = strrchr(name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(suffix, strrchr(destsnap, '/'), sizeof (suffix)); if (guid_to_name(hdl, name, parent_snapguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, suffix, sizeof (destsnap)); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. 
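* Otherwise the filesystem is located by its fromsnap guid and the snapshot component is spliced back onto whatever name guid_to_name() resolved.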
*/ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(snap, strchr(destsnap, '@'), sizeof (snap)); if (guid_to_name(hdl, name, drrb->drr_fromguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, snap, sizeof (destsnap)); } } } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; if (flags->heal) { if (flags->isprefix || flags->istail || flags->force || flags->canmountoff || flags->resumable || flags->nomount || flags->skipholds) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv can not be used when combined with" " this flag")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } uint64_t guid = get_snap_guid(hdl, name, strchr(destsnap, '@') + 1); if (guid == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv must specify an existing snapshot" " to heal")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } else if (guid != drrb->drr_toguid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local snapshot doesn't match the snapshot" " in the provided stream")); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } else if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL; boolean_t encrypted; (void) strcpy(zc.zc_name, name); /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; if (!flags->force) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && strrchr(name, '/') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s is the root dataset\n" "cannot overwrite with a ZVOL"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" "cannot overwrite with a ZVOL"), zc.zc_name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } if ((zhp = zfs_open(hdl, name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { err = -1; goto out; } /* * When receiving full/newfs on existing dataset, then it * should be done with "-F" flag. Its enforced for initial * receive in previous checks in this function. * Similarly, on resuming full/newfs recv on existing dataset, * it should be done with "-F" flag. * * When dataset doesn't exist, then full/newfs recv is done on * newly created dataset and it's marked INCONSISTENT. But * When receiving on existing dataset, recv is first done on * %recv and its marked INCONSISTENT. Existing dataset is not * marked INCONSISTENT. 
* Resume of full/newfs receive with dataset not INCONSISTENT * indicates that its resuming newfs on existing dataset. So, * enforce "-F" flag in this case. */ if (stream_resumingnewfs && !zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && !flags->force) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing destination '%s'\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_RESUME_EXISTS, errbuf); goto out; } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } /* * Raw sends can not be performed as an incremental on top * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same * time. The -F flag may still be used for deleting * intermediate snapshots that would otherwise prevent the * receive from working. */ encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF; if (!stream_wantsnewfs && !encrypted && raw) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot perform raw receive on top of " "existing unencrypted dataset")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (stream_wantsnewfs && flags->force && ((raw && !encrypted) || encrypted)) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive -F cannot be used to destroy an " "encrypted filesystem or overwrite an " "unencrypted one with an encrypted one")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && (stream_wantsnewfs || stream_resumingnewfs)) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; goto out; } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); err = -1; goto out; } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; /* we want to know if we're zoned when validating -o|-x props */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); /* may need this info later, get it now we have zhp around */ if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) hastoken = B_TRUE; /* gather existing properties on destination */ origprops = fnvlist_alloc(); fnvlist_merge(origprops, zhp->zfs_props); fnvlist_merge(origprops, zhp->zfs_user_props); zfs_close(zhp); } else { zfs_handle_t *zhp; /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). 
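* For example (hypothetical names), receiving a full stream straight into "tank" fails here, while "tank/newfs" is accepted because its parent "tank" can be opened and validated just below.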
*/ cp = strrchr(name, '/'); if (!stream_wantsnewfs || cp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), name); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. */ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, destsnap, strlen(tosnap)) != 0) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } /* validate parent */ zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent '%s' is not a filesystem"), name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); zfs_close(zhp); goto out; } zfs_close(zhp); newfs = B_TRUE; *cp = '/'; } if (flags->verbose) { (void) printf("%s %s%s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", flags->heal ? "corrective " : "", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); } /* * If this is the top-level dataset, record it so we can use it * for recursive operations later. */ if (top_zfs != NULL && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { toplevel = B_TRUE; if (*top_zfs == NULL) *top_zfs = zfs_strdup(hdl, name); } if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { type = ZFS_TYPE_FILESYSTEM; } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type: 0x%d"), drrb->drr_type); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; /* * When sending with properties (zfs send -p), the encryption property * is not included because it is a SETONCE property and therefore * treated as read only. However, we are always able to determine its * value because raw sends will include it in the DRR_BDEGIN payload * and non-raw sends with properties are not allowed for encrypted * datasets. Therefore, if this is a non-raw properties stream, we can * infer that the value should be ZIO_CRYPT_OFF and manually add that * to the received properties. */ if (stream_wantsnewfs && !raw && rcvprops != NULL && !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { if (oxprops == NULL) oxprops = fnvlist_alloc(); fnvlist_add_uint64(oxprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } if (flags->dryrun) { void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); /* * We have read the DRR_BEGIN record, but we have * not yet read the payload. For non-dryrun sends * this will be done by the kernel, so we must * emulate that here, before attempting to read * more records. 
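* In other words, for a dry run (-n) we consume the DRR_BEGIN payload ourselves and then let recv_skip() walk and discard the remaining records.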
*/ err = recv_read(hdl, infd, buf, drr->drr_payloadlen, flags->byteswap, NULL); free(buf); if (err != 0) goto out; err = recv_skip(hdl, infd, flags->byteswap); goto out; } if (flags->heal) { err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->heal, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } else { err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } ioctl_errno = ioctl_err; prop_errflags = errflags; if (err == 0) { nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), name); zfs_setprop_error(hdl, prop, intval, tbuf); } } } if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, destsnap, sizeof (zc.zc_name)); zc.zc_cookie = B_TRUE; /* received */ zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } if (err == 0 && snapholds_nvlist) { nvpair_t *pair; nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { fnvlist_add_string(holds, destsnap, nvpair_name(pair)); } (void) lzc_hold(holds, cleanup_fd, &errors); fnvlist_free(snapholds_nvlist); fnvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(destsnap, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. 
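* gather_nvlist() re-lists the local datasets so we can tell whether drr_toguid already exists; if it does, the rest of this snapshot's stream is consumed with recv_skip() and the receive is treated as a success.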
*/ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); fnvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", destsnap); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(destsnap, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), destsnap); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: if (flags->heal) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "key must be loaded to do a non-raw " "corrective recv on an encrypted " "dataset.")); } else if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key does not match " "existing key")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "inherited key must be loaded")); } (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); break; case EEXIST: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), destsnap); *cp = '@'; break; case EINVAL: if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); } else if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective receive was not able to " "reconstruct the data needed for " "healing.")); else recv_ecksum_set_aux(hdl, destsnap, flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental send stream requires -L " "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream is not compatible with the " "data in the pool.")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this " "stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case ZFS_ERR_CRYPTO_NOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream uses crypto parameters not compatible with " "this pool")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded."), name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. 
See errata %u at " "https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid mismatch. See the 'zfs receive' " "man page section\n discussing the limitations " "of raw encrypted send streams.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Spill block flag missing for raw send.\n" "The zfs software on the sending system must " "be updated.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_RESUME_EXISTS: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing dataset without force")); (void) zfs_error_fmt(hdl, EZFS_RESUME_EXISTS, dgettext(TEXT_DOMAIN, "cannot resume recv %s"), destsnap); *cp = '@'; break; case E2BIG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive required kernel memory allocation " "larger than the system can support. Please file " "an issue at the OpenZFS issue tracker:\n" "https://github.com/openzfs/zfs/issues/new")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case EBUSY: if (hastoken) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s contains " "partially-complete state from " "\"zfs receive -s\"."), name); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } zfs_fallthrough; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). 
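* changelist_postfix() remounts anything that had to be unmounted for the receive; mounting newly created filesystems is deferred through flags->domount and handled by zfs_receive() once the whole package has been processed.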
*/ if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) flags->domount = B_TRUE; if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) { err = -1; goto out; } if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = read_bytes; struct timespec delta; clock_gettime(CLOCK_MONOTONIC_RAW, &delta); if (begin_time.tv_nsec > delta.tv_nsec) { delta.tv_nsec = 1000000000 + delta.tv_nsec - begin_time.tv_nsec; delta.tv_sec -= 1; } else delta.tv_nsec -= begin_time.tv_nsec; delta.tv_sec -= begin_time.tv_sec; if (delta.tv_sec == 0 && delta.tv_nsec == 0) delta.tv_nsec = 1; double delta_f = delta.tv_sec + (delta.tv_nsec / 1e9); zfs_nicebytes(bytes, buf1, sizeof (buf1)); zfs_nicebytes(bytes / delta_f, buf2, sizeof (buf2)); (void) printf("received %s stream in %.2f seconds (%s/sec)\n", buf1, delta_f, buf2); } err = 0; out: if (prop_errors != NULL) fnvlist_free(prop_errors); if (tmp_keylocation[0] != '\0') { fnvlist_add_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation); } if (newprops) fnvlist_free(rcvprops); fnvlist_free(oxprops); fnvlist_free(origprops); return (err); } /* * Check properties we were asked to override (both -o|-x) */ static boolean_t zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, const char *errbuf) { nvpair_t *nvp = NULL; zfs_prop_t prop; const char *name; while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { name = nvpair_name(nvp); prop = zfs_name_to_prop(name); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s: invalid property '%s'"), errbuf, name); return (B_FALSE); } continue; } /* * "origin" is readonly but is used to receive datasets as * clones so we don't raise an error here */ if (prop == ZFS_PROP_ORIGIN) continue; /* encryption params have their own verification later */ if (prop == ZFS_PROP_ENCRYPTION || zfs_prop_encryption_key_param(prop)) continue; /* * cannot override readonly, set-once and other specific * settable properties */ if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION || prop == ZFS_PROP_VOLSIZE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s: invalid property '%s'"), errbuf, name); return (B_FALSE); } } return (B_TRUE); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[ERRBUFLEN]; zio_cksum_t zcksum = { { 0 } }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* check cmdline props, raise an error if they cannot be received */ if (!zfs_receive_checkprops(hdl, cmdprops, errbuf)) return (zfs_error(hdl, EZFS_BADPROP, errbuf)); if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if 
(originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. */ memset(&zcksum, 0, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { /* * Let's be explicit about this one, since rather than * being a new feature we can't know, it's an old * feature we dropped. */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has deprecated feature: dedup, try " "'zstream redup [send in a file] | zfs recv " "[...]'")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = " "%llx (unknown flags = %llx)"), (u_longlong_t)featureflags, (u_longlong_t)((featureflags) & ~DMU_BACKUP_FEATURE_MASK)); } return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } /* Holds feature is set once in the compound stream header. */ if (featureflags & DMU_BACKUP_FEATURE_HOLDS) flags->holds = B_TRUE; if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, finalsnap, cmdprops)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cmdprops)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. 
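* This is the entry point libzfs exposes for "zfs receive": it drives zfs_receive_impl() over the stream and then mounts the received tree if that was requested.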
* Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; struct stat sb; const char *originsnap = NULL; /* * The only way fstat can fail is if we do not have a valid file * descriptor. */ if (fstat(infd, &sb) == -1) { perror("fstat"); return (-2); } if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, NULL, props); if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { err = -1; goto out; } else { if (zhp->zfs_type == ZFS_TYPE_VOLUME) { zfs_close(zhp); goto out; } clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) { err = -1; goto out; } /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); if (err != 0) err = -1; } } out: if (top_zfs) free(top_zfs); return (err); } diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5a47cbbe22c2..cf6720317d9f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1,2858 +1,2864 @@ .\" .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd June 27, 2024 +.\" Copyright (c) 2024, Klara, Inc. +.\" +.Dd October 2, 2024 .Dt ZFS 4 .Os . .Sh NAME .Nm zfs .Nd tuning of the ZFS kernel module . .Sh DESCRIPTION The ZFS module supports these parameters: .Bl -tag -width Ds .It Sy dbuf_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd of the target ARC size. The behavior of the dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the metadata dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th of the target ARC size. 
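For example, with a 64 GiB target ARC size, the default shift of 6 caps the metadata dbuf cache at 1 GiB.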
The behavior of the metadata dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage over .Sy dbuf_cache_max_bytes when dbufs must be evicted directly. . .It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage below .Sy dbuf_cache_max_bytes when the evict thread stops evicting dbufs. . .It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq uint Set the size of the dbuf cache .Pq Sy dbuf_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq uint Set the size of the dbuf metadata cache .Pq Sy dbuf_metadata_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint Set the size of the mutex array for the dbuf cache. When set to .Sy 0 the array is dynamically sized based on total system memory. . .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . .It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint Controls the number of copies stored for DeDup Table .Pq DDT objects. Reducing the number of copies to 1 from the previous default of 3 can reduce the write inflation caused by deduplication. This assumes redundancy for this data is provided by the vdev layer. If the DDT is damaged, space may be leaked .Pq not freed when the DDT can not report the correct reference count. . .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. . .It Sy ignore_hole_birth Pq int Alias for .Sy send_holes_without_birth_time . . .It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. . .It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq u64 Min feed interval in milliseconds. Requires .Sy l2arc_feed_again Ns = Ns Ar 1 and only applicable in related situations. . .It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq u64 Seconds between L2ARC writing. . .It Sy l2arc_headroom Ns = Ns Sy 8 Pq u64 How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of .Sy l2arc_write_max . ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to .Sy 0 , allowing the full length of ARC lists to be searched for cacheable content. . .It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq u64 Scales .Sy l2arc_headroom by this percentage when L2ARC contents are being successfully compressed before writing. A value of .Sy 100 disables this feature. . .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether buffers present on special vdevs are eligible for caching into L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. . .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. .Pp The default is 0, meaning both MRU and MFU data and metadata are cached. When turning off this feature (setting it to 0), some MRU buffers will still be present in ARC and eventually cached on L2ARC. 
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 , some prefetched buffers will be cached to L2ARC, and those might later transition to MRU, in which case the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp Setting it to 1 means to L2 cache only MFU data and metadata. .Pp Setting it to 2 means to L2 cache all metadata (MRU+MFU) but only MFU data (i.e. MRU data are not cached). This can be the right setting to cache as much metadata as possible even when having high data turnover. .Pp Regardless of .Sy l2arc_noprefetch , some MFU buffers might be evicted from ARC, accessed later on as prefetches and transition to MRU as prefetches. If accessed again they are counted as MRU and the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp The ARC status of L2ARC buffers when they were first cached in L2ARC can be seen in the .Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize arcstats when importing the pool or onlining a cache device if persistent L2ARC is enabled. .Pp The .Sy evict_l2_eligible_mru arcstat does not take into account if this option is enabled as the information provided by the .Sy evict_l2_eligible_m[rf]u arcstats can be used to decide if toggling this option is appropriate for the current workload. . .It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq uint Percent of ARC size allowed for L2ARC-only headers. Since L2ARC buffers are not evicted on memory pressure, too many headers on a system with an irrationally large L2ARC can render it slow or unusable. This parameter limits L2ARC writes and rebuilds to achieve the target. . .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64 Trims ahead of the current write size .Pq Sy l2arc_write_max on L2ARC devices by this percentage of write size if we have filled the device. If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. A minimum of .Sy 64 MiB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is invalid upon importing a pool or onlining a cache device. A value of .Sy 0 disables TRIM on L2ARC altogether and is the default as it can put significant stress on the underlying storage devices. This will vary depending on how well the specific device handles these commands. . .It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Do not write buffers to L2ARC if they were prefetched but not used by applications. In case there are prefetched buffers in L2ARC and this option is later set, we do not read the prefetched buffers from L2ARC. Unsetting this option is useful for caching sequential reads from the disks to L2ARC and serving those reads from L2ARC later on. This may be beneficial in case the L2ARC device is significantly faster in sequential reads than the disks of the pool. .Pp Use .Sy 1 to disable and .Sy 0 to enable caching/reading prefetches to/from L2ARC. . .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . .It Sy l2arc_write_boost Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Cold L2ARC devices will have .Sy l2arc_write_max increased by this amount while they remain cold. . .It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Max write bytes per interval. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be disabled if there are problems importing a pool or attaching an L2ARC device (e.g.
the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). . .It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Minimum size of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the persistent L2ARC. .Pp For L2ARC devices less than 1 GiB, the amount of data .Fn l2arc_evict evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . .It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Metaslab granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to each disk before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group biasing based on their vdevs' over- or under-utilization relative to the pool. . .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64 Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . .It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . .It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Controls prefetching BRT records for blocks which are going to be cloned. . .It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int Default BRT ZAP data block size as a power of 2. Note that changing this after creating a BRT on the pool will not affect existing BRTs, only newly created ones. . .It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int Default BRT ZAP indirect block size as a power of 2. Note that changing this after creating a BRT on the pool will not affect existing BRTs, only newly created ones. . .It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int Default dnode indirect block size as a power of 2. . .It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable Direct I/O. If this setting is 0, then all I/O requests will be directed through the ARC, acting as though the dataset property .Sy direct was set to .Sy disabled . . .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). This must be less than .Sy DMU_MAX_ACCESS Pq 64 MiB . This applies primarily to .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . . .It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent log spacemaps from being destroyed during pool exports and destroys. .
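.Pp
As an illustrative aside, the block-size tunables above (such as brt_zap_default_bs, ddt_zap_default_bs, zfs_default_bs and zfs_default_ibs) are expressed as powers of two, so a value of 12 means 4 KiB and 15 means 32 KiB.
A minimal C sketch of the conversion (illustration only, not OpenZFS code):
.Bd -literal -compact
#include <stdio.h>

int
main(void)
{
	/* Example shift values taken from the defaults documented above. */
	unsigned int brt_bs = 12;	/* brt_zap_default_bs: 1 << 12 = 4 KiB */
	unsigned int ddt_bs = 15;	/* ddt_zap_default_bs: 1 << 15 = 32 KiB */

	printf("brt_zap_default_bs=%u -> %llu bytes\n", brt_bs, 1ULL << brt_bs);
	printf("ddt_zap_default_bs=%u -> %llu bytes\n", ddt_bs, 1ULL << ddt_bs);
	return (0);
}
.Ed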
.It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. . .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating from the active metaslab until this option's worth of buckets have been exhausted. . .It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int Load all metaslabs during pool import. . .It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent metaslabs from being unloaded. . .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable use of the fragmentation metric in computing metaslab weights. . .It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see .Em >100`000 iterations and .Fn metaslab_block_picker becomes the performance limiting factor on high-performance storage. .Pp With the default setting of .Sy 16 MiB , we typically see less than .Em 500 iterations, even with very fragmented .Sy ashift Ns = Ns Sy 9 pools. The maximum number of iterations possible is .Sy metaslab_df_max_search / 2^(ashift+1) . With the default setting of .Sy 16 MiB this is .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 or .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . . .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int If not searching forward (due to .Sy metaslab_df_max_search , metaslab_df_free_pct , .No or Sy metaslab_df_alloc_threshold ) , this tunable controls which segment is used. If set, we will use the largest free segment. If unset, we will use a segment of at least the requested size. . .It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq u64 When we unload a metaslab, we cache the size of the largest free chunk. We use that cached size to determine whether or not to load a metaslab for a given allocation. As more frees accumulate in that metaslab while it's unloaded, the cached max size becomes less and less accurate. After a number of seconds controlled by this tunable, we stop considering the cached max size and start considering only the histogram instead. . .It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq uint When we are loading a new metaslab, we check the amount of memory being used to store metaslab range trees. If it is over a threshold, we attempt to unload the least recently used metaslab to prevent the system from clogging all of its memory with range trees. This tunable sets the percentage of total system memory that is the threshold. . .It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int .Bl -item -compact .It If unset, we will first try normal allocation. .It If that fails then we will do a gang allocation. .It If that fails then we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El .Pp .Bl -item -compact .It If set, we will first try normal allocation. .It If that fails then we will do a "try hard" allocation. .It If that fails we will do a gang allocation. .It If that fails we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El . .It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq uint When not trying hard, we only consider this number of the best metaslabs. This improves performance, especially when there are many metaslabs per vdev and the allocation can't actually be satisfied (so we would otherwise iterate all metaslabs). . 
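.Pp
The iteration bound quoted above for zfs_metaslab_df_max_search follows from dividing the search distance by twice the sector size.
A minimal C sketch of that arithmetic, assuming the default 16 MiB search distance (illustration only, not OpenZFS code):
.Bd -literal -compact
#include <stdio.h>

int
main(void)
{
	unsigned long long max_search = 16ULL << 20;	/* 16 MiB default */
	unsigned int ashifts[] = { 9, 12 };

	for (int i = 0; i < 2; i++) {
		/* max iterations = metaslab_df_max_search / 2^(ashift+1) */
		unsigned long long iters =
		    max_search / (1ULL << (ashifts[i] + 1));
		printf("ashift=%u -> at most %llu iterations\n",
		    ashifts[i], iters);	/* 16384 for ashift=9, 2048 for 12 */
	}
	return (0);
}
.Ed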
.It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint When a vdev is added, target this number of metaslabs per top-level vdev. . .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint Default lower limit for metaslab size. . .It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint Default upper limit for metaslab size. . .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint If non-zero, then a Direct I/O write's checksum will be verified every time the write is issued and before it is committed to the block pointer. In the event the checksum is not valid, the I/O operation will return EIO. This module parameter can be used to detect if the contents of the user's buffer have changed in the process of doing a Direct I/O write. It can also help to identify if reported checksum errors are tied to Direct I/O writes. Each verify error causes a .Sy dio_verify zevent. Direct Write I/O checksum verify errors can be seen with .Nm zpool Cm status Fl d . The default value for this is 1 on Linux, but is 0 for .Fx because user pages can be placed under write protection in .Fx before the Direct I/O write is issued. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . .It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq uint Minimum number of metaslabs to create in a top-level vdev. . .It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int Skip label validation steps during pool import. Changing is not recommended unless you know what you're doing and are recovering a damaged label. . .It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq uint Practical upper limit of total metaslabs per top-level vdev. . .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . .It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint Maximum number of metaslabs per group to preload . .It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint Percentage of CPUs to run a metaslab preload taskq . .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, as is typically the case on a modern constant angular velocity disk drive. . .It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq uint After a metaslab is used, we keep it loaded for this many TXGs, to attempt to reduce unnecessary reloading. Note that both this many TXGs and .Sy metaslab_unload_delay_ms milliseconds must pass before unloading will occur. . .It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq uint After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note that both this many milliseconds and .Sy metaslab_unload_delay TXGs must pass before unloading will occur. . .It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong Max amount of memory to use for RAID-Z expansion I/O. This limits how much I/O can be outstanding at once. . .It Sy raidz_expand_max_reflow_bytes Ns = Ns Sy 0 Pq ulong For testing, pause RAID-Z expansion when reflow amount reaches this value. .
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong For expanded RAID-Z, aggregate reads that have more rows than this. . .It Sy reference_history Ns = Ns Sy 3 Pq int Maximum reference holders being tracked when reference_tracking_enable is active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to .Sy refcount_t objects (debug builds only). . .It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int When set, the .Sy hole_birth optimization will not be used, and all holes will always be sent during a .Nm zfs Cm send . This is useful if you suspect your datasets are affected by a bug in .Sy hole_birth . . .It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp SPA config file. . .It Sy spa_asize_inflation Ns = Ns Sy 24 Pq uint Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration. Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. . .It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to print the vdev tree in the debugging message buffer during pool import. . .It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse data blocks during an "extreme rewind" .Pq Fl X import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal skips non-metadata blocks. It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. . .It Sy spa_load_verify_metadata Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse blocks during an "extreme rewind" .Pq Fl X pool import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. . .It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq uint Sets the maximum number of bytes to consume during pool import to the log2 fraction of the target ARC size. . .It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int Normally, we don't allow the last .Sy 3.2% Pq Sy 1/2^spa_slop_shift of space in the pool to be consumed. This ensures that we don't run the pool completely out of space, due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int Determines the number of block allocators to use per spa instance. Capped by the number of actual CPUs in the system via .Sy spa_cpus_per_allocator . .Pp Note that setting this value too high could result in performance degradation and/or excess fragmentation. The set value only applies to pools imported or created afterwards. . .It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int Determines the minimum number of CPUs in a system per block allocator, per spa instance. The set value only applies to pools imported or created afterwards. . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the .Sy head_errlog feature.
The default is to convert all log entries. . .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, which will be included as "unnecessary" data in a chunk of copied data. .Pp The default value here was chosen to align with .Sy zfs_vdev_read_gap_limit , which is a similar concept when doing regular reads (but there's no reason it has to be the same). . .It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Logical ashift for file-based devices. . .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Physical ashift for file-based devices. . .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, when we start iterating over a ZAP object, prefetch the entire object (all leaf blocks). However, this is limited by .Sy dmu_prefetch_max . . .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Maximum micro ZAP size. -A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. +A "micro" ZAP is upgraded to a "fat" ZAP once it grows beyond the specified +size. +Sizes higher than 128KiB will be clamped to 128KiB unless the +.Sy large_microzap +feature is enabled. . .It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, adjacent empty ZAP blocks will be collapsed, reducing disk space. . .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to this value, doubling on each hit. After that it may grow further by 1/8 per hit, but only if some prefetch since last time haven't completed in time to satisfy demand request, i.e. prefetch depth didn't cover the read latency or the pool got saturated. . .It Sy zfetch_max_distance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch per stream. . .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . .It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Requests within this byte distance from the current prefetch stream position are considered parts of the stream, reordered due to parallel processing. Such requests do not advance the stream position immediately unless .Sy zfetch_hole_shift fill threshold is reached, but saved to fill holes in the stream later. . .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . .It Sy zfetch_min_sec_reap Ns = Ns Sy 1 Pq uint Min time before inactive prefetch stream can be reclaimed . .It Sy zfetch_max_sec_reap Ns = Ns Sy 2 Pq uint Max time before inactive prefetch stream can be deleted . .It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enables ARC from using scatter/gather lists and forces all allocations to be linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. . .It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. .Pp The value of .Sy MAX_ORDER depends on kernel configuration. . .It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint This is the minimum allocation size that will use scatter (page-based) ABDs. 
Smaller allocations will use linear ABDs. . .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64 When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. This value acts as a ceiling to the amount of dnode metadata, and defaults to .Sy 0 , which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp See also .Sy zfs_arc_dnode_limit , which serves a similar purpose but has a higher priority if nonzero. . .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds .Sy zfs_arc_dnode_limit . . .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint The ARC's buffer hash table is sized based on the assumption of an average block size of this value. This works out to roughly 1 MiB of hash table per 1 GiB of physical memory with 8-byte pointers. For configurations with a known larger average block size, this value can be increased to reduce the memory footprint. . .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint When .Fn arc_is_overflowing , .Fn arc_get_data_impl waits for this percent of the requested amount of data to be evicted. For example, by default, for every .Em 2 KiB that's evicted, .Em 1 KiB of it may be "reused" by a new allocation. Since this is above .Sy 100 Ns % , it ensures that progress is made towards getting .Sy arc_size No under Sy arc_c . Since this is finite, it ensures that allocations can still happen, even during the potentially long time that .Sy arc_size No is more than Sy arc_c . . .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint If set to a non zero value, it will replace the .Sy arc_grow_retry value with this value. The .Sy arc_grow_retry .No value Pq default Sy 5 Ns s is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. . .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to .Sy 0 will disable the throttle. . .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 Max size of ARC in bytes. If .Sy 0 , then the max size of ARC is determined by the amount of system memory installed. The larger of .Sy all_system_memory No \- Sy 1 GiB and .Sy 5/8 No \(mu Sy all_system_memory will be used as the limit. This value must be at least .Sy 67108864 Ns B Pq 64 MiB . .Pp This value can be changed dynamically, with some caveats. It cannot be set back to .Sy 0 while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint Balance between metadata and data on ghost hits. Values above 100 increase metadata caching by proportionally reducing effect of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. 
.No If set to Sy 0 , arc_c_min will default to consuming the larger of .Sy 32 MiB and .Sy all_system_memory No / Sy 32 . . .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint Minimum time prefetched blocks are locked in the ARC. . .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint Minimum time "prescient prefetched" blocks are locked in the ARC. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. . .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int Number of arc_prune threads. .Fx does not need more than one. Linux may theoretically use one per mount point up to number of CPUs, but that was not proven to be useful. . .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). . .It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 Maximum size in bytes allowed to be passed as .Sy zc_nvlist_src_size for ioctls on .Pa /dev/zfs . This prevents a user from causing the kernel to allocate an excessive amount of memory. When the limit is exceeded, the ioctl fails with .Sy EINVAL and a description of the error is sent to the .Pa zfs-dbgmsg log. This parameter should not need to be touched under normal circumstances. If .Sy 0 , equivalent to a quarter of the user-wired memory limit under .Fx and to .Sy 134217728 Ns B Pq 128 MiB under Linux. . .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint To allow more fine-grained locking, each ARC state contains a series of lists for both data and metadata objects. Locking is performed at the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .Pp If .Sy 0 , equivalent to the greater of the number of online CPUs and .Sy 4 . . .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int The ARC size is considered to be overflowing if it exceeds the current ARC target size .Pq Sy arc_c by thresholds determined by this parameter. Exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 starts ARC reclamation process. If that appears insufficient, exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 blocks new buffer allocation until the reclaim thread catches up. Started reclamation process continues till ARC size returns below the target size. .Pp The default value of .Sy 8 causes the ARC to start reclamation if it exceeds the target size by .Em 0.2% of the target size, and block allocations by .Em 0.6% . . .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 with the new value. . .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint Percent of pagecache to reclaim ARC to. .Pp This tunable allows the ZFS ARC to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows the ARC to be reclaimed down to .Sy zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by .Sy NR_FILE_PAGES ) , where that percent may exceed .Sy 100 . This only operates during memory pressure/reclaim. . .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. 
Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp The default limit of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, even with a small average compressed block size of ~8 KiB. .Pp The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. . .It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int Relative cost of ARC eviction on Linux, AKA number of seeks needed to restore evicted page. Bigger values make ARC more precious and evictions smaller, comparing to other kernel subsystems. Value of 4 means parity with page cache. . .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of .Sy 512 KiB No and Sy all_system_memory/64 . . .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int Disable pool import at module load by ignoring the cache file .Pq Sy spa_config_path . . .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit checksum events to this many per second. Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . .It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). . .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int Vdev indirection layer (used for device removal) sleeps for this many milliseconds during mapping generation. Intended for use with the test suite to throttle vdev removal speed. . .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint Minimum percent of obsolete bytes in vdev mapping required to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . Intended for use with the test suite to facilitate triggering condensing as needed. . .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable condensing indirect vdev mappings. When set, attempt to condense indirect vdev mappings if the mapping uses more than .Sy zfs_condense_min_mapping_bytes bytes of memory and if the obsolete space map object uses more than .Sy zfs_condense_max_obsolete_bytes bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. . .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 Minimum size vdev mapping to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Internally ZFS keeps a small log to facilitate debugging. The log is enabled by default, and can be disabled by unsetting this option. The contents of the log can be accessed by reading .Pa /proc/spl/kstat/zfs/dbgmsg . 
Writing .Sy 0 to the file clears the log. .Pp This setting does not influence debug prints due to .Sy zfs_flags . . .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Maximum size of the internal ZFS debug log. . .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 Check time in milliseconds. This defines the frequency at which we check for hung I/O requests and potentially invoke the .Sy zfs_deadman_failmode behavior. . .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , or when an individual I/O operation takes longer than .Sy zfs_deadman_ziotime_ms , then the operation is considered to be "hung". If .Sy zfs_deadman_enabled is set, then the deadman behavior is invoked as described by .Sy zfs_deadman_failmode . By default, the deadman is enabled and set to .Sy wait which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . .It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int Rate limit deadman zevents (which report hung I/O operations) to this many per second. . .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: .Bl -tag -compact -offset 4n -width "continue" .It Sy wait Wait for a "hung" operation to complete. For each "hung" operation a "deadman" event will be posted describing that operation. .It Sy continue Attempt to recover from a "hung" operation by re-dispatching it to the I/O pipeline if possible. .It Sy panic Panic the system. This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the pool sync completes. . .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the operation remains "hung", the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the operation completes. . .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . .It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint Maximum number of dedup log flush passes (iterations) each transaction. .Pp At the start of each transaction, OpenZFS will estimate how many entries it needs to flush out to keep up with the change rate, taking the amount and time taken to flush on previous txgs into account (see .Sy zfs_dedup_log_flush_flow_rate_txgs ) . It will spread this amount into a number of passes. At each pass, it will use the amount already flushed and the total time taken by flushing and by other IO to recompute how much it should do for the remainder of the txg. .Pp Reducing the max number of passes will make flushing more aggressive, flushing out more entries on each pass. This can be faster, but also more likely to compete with other IO. 
Increasing the max number of passes will put fewer entries onto each pass, keeping the overhead of dedup changes to a minimum but possibly causing a large number of changes to be dumped on the last pass, which can blow out the txg sync time beyond .Sy zfs_txg_timeout . . .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint Minimum time to spend on dedup log flush each transaction. .Pp At least this long will be spent flushing dedup log entries each transaction, up to .Sy zfs_txg_timeout . This occurs even if doing so would delay the transaction, that is, even if other IO completes in under this time. . .It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint Flush at least this many entries each transaction. .Pp OpenZFS will estimate how many entries it needs to flush each transaction to keep up with the ingest rate (see .Sy zfs_dedup_log_flush_flow_rate_txgs ) . This sets the minimum for that estimate. Raising it can force OpenZFS to flush more aggressively, keeping the log small and so reducing pool import times, but can make it less able to back off if log flushing would compete with other IO too much. . .It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint Number of transactions to use to compute the flow rate. .Pp OpenZFS will estimate how many entries it needs to flush each transaction by monitoring the number of entries changed (ingest rate), number of entries flushed (flush rate) and time spent flushing (flush time rate) and combining these into an overall "flow rate". It will use an exponential weighted moving average over some number of recent transactions to compute these rates. This sets the number of transactions to compute these averages over. Setting it higher can help to smooth out the flow rate in the face of spiky workloads, but will take longer for the flow rate to adjust to a sustained change in the ingress rate. . .It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint Max transactions before starting to flush dedup logs. .Pp OpenZFS maintains two dedup logs, one receiving new changes, one flushing. If there is nothing to flush, it will accumulate changes for no more than this many transactions before switching the logs and starting to flush entries out. . .It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 Max memory to use for dedup logs. .Pp OpenZFS will spend no more than this much memory on maintaining the in-memory dedup log. Flushing will begin when around half this amount is being spent on logs. The default value of .Sy 0 will cause it to be set by .Sy zfs_dedup_log_mem_max_percent instead. . .It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint Max memory to use for dedup logs, as a percentage of total memory. .Pp If .Sy zfs_dedup_log_mem_max is not set, it will be initialised as a percentage of the total memory in the system. . .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of .Sy zfs_dirty_data_max . This value should be at least .Sy zfs_vdev_async_write_active_max_dirty_percent . .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .Pp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number.
.No See Sx ZFS TRANSACTION DELAY . .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . .It Sy zfs_dio_write_verify_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit Direct I/O write verify events to this many per second. . .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. The default value is also the maximum. . .It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint Size of the znode hashtable used for holds. .Pp Due to the need to hold locks on objects that may not exist yet, kernel mutexes are not created per-object and instead a hashtable is used where collisions will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int Rate limit delay zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory, in bytes. . .It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq u64 Part of overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap, in millionths. . .It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq u64 Describes the maximum number of log spacemap blocks allowed for each pool. The default value means that the space in all the log spacemaps can add up to no more than .Sy 131072 blocks (which means .Em 16 GiB of logical space before compression and ditto blocks, assuming that blocksize is .Em 128 KiB ) . .Pp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease the number of I/O operations for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the amount of flushing increases, destroying log blocks quicker as they become obsolete faster, which leaves less blocks to be read during import time after a crash. .Pp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. . .It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq u64 If the number of metaslabs is small and our incoming rate is high, we could get into a situation that we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. . .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq u64 Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of unflushed metaslabs in the pool. . 
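.Pp
The capacity figure quoted above for zfs_unflushed_log_block_max is just the block count multiplied by an assumed 128 KiB block size.
A minimal C sketch of that arithmetic (illustration only, not OpenZFS code):
.Bd -literal -compact
#include <stdio.h>

int
main(void)
{
	unsigned long long max_blocks = 131072;		/* default block count */
	unsigned long long blocksize = 128ULL << 10;	/* assumed 128 KiB */
	unsigned long long bytes = max_blocks * blocksize;

	/* 131072 blocks * 128 KiB = 16 GiB of logical space */
	printf("%llu blocks * %llu B = %llu GiB\n",
	    max_blocks, blocksize, bytes >> 30);
	return (0);
}
.Ed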
.It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq u64 Tunable limiting maximum time in TXGs any metaslab may remain unflushed. It effectively limits the maximum number of unflushed per-TXG spacemap logs that need to be read after unclean pool export. . .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite. . .It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong This is used to define a large file for the purposes of deletion. Files containing more than .Sy zfs_delete_blocks will be deleted asynchronously, while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an .Xr unlink 2 system call, at the expense of a longer delay before the freed space is available. This only applies on Linux. . .It Sy zfs_dirty_data_max Ns = Pq int Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over .Sy zfs_dirty_data_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy physical_ram/10 , capped at .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_max_max Ns = Pq int Maximum allowable value of .Sy zfs_dirty_data_max , expressed in bytes. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. This parameter takes precedence over .Sy zfs_dirty_data_max_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy min(physical_ram/4, 4GiB) , or .Sy min(physical_ram/4, 1GiB) for 32-bit systems. . .It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq uint Maximum allowable value of .Sy zfs_dirty_data_max , expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. The parameter .Sy zfs_dirty_data_max_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq uint Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter .Sy zfs_dirty_data_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . .Pp Subject to .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq uint Start syncing out a transaction group if there's at least this much dirty data .Pq as a percentage of Sy zfs_dirty_data_max . This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction ZIL log data size in bytes. Write operations are throttled when approaching the limit until log data is cleared out after transaction group sync. Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput . It also should be smaller than the size of the slog device if slog is present. .Pp Defaults to .Sy zfs_dirty_data_max*2 .
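.Pp
The defaults for zfs_dirty_data_max, zfs_dirty_data_max_max and zfs_wrlog_data_max described above can be illustrated with a small C sketch, assuming a 64-bit system with 64 GiB of RAM (illustration only, not OpenZFS code):
.Bd -literal -compact
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned long long ram = 64ULL << 30;	/* example: 64 GiB of RAM */

	/* Default cap: min(physical_ram/4, 4 GiB) on 64-bit systems. */
	unsigned long long max_max = MIN(ram / 4, 4ULL << 30);
	/* Default limit: physical_ram/10, capped at zfs_dirty_data_max_max. */
	unsigned long long dirty_max = MIN(ram / 10, max_max);
	/* zfs_wrlog_data_max defaults to twice zfs_dirty_data_max. */
	unsigned long long wrlog_max = dirty_max * 2;

	printf("zfs_dirty_data_max_max = %llu MiB\n", max_max >> 20);
	printf("zfs_dirty_data_max     = %llu MiB\n", dirty_max >> 20);
	printf("zfs_wrlog_data_max     = %llu MiB\n", wrlog_max >> 20);
	return (0);
}
.Ed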
.It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not run out of space. Instead, .Xr fallocate 2 space preallocation only checks that sufficient space is currently available in the pool or the user's project quota allocation, and then creates a sparse file of the requested size. The requested space is multiplied by .Sy zfs_fallocate_reserve_percent to allow additional space for indirect blocks and other internal metadata. Setting this to .Sy 0 disables support for .Xr fallocate 2 and causes it to return .Sy EOPNOTSUPP . . .It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string Select a fletcher 4 implementation. .Pp Supported selectors are: .Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw , .No and Sy aarch64_neon . All except .Sy fastest No and Sy scalar require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the .Sy fastest will be chosen using a micro benchmark. Selecting .Sy scalar results in the original CPU-based calculation being used. Selecting any option other than .Sy fastest No or Sy scalar results in vector instructions from the respective CPU instruction set being used. . .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be written to disk. This allows the clone operation to reliably succeed when a file is modified and then immediately cloned. For small files this may be slower than making a copy of the file. Therefore, this setting defaults to 0 which causes a clone operation to immediately fail when encountering a dirty block. . .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp Supported selectors are: .Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 . All except .Sy cycle , fastest No and Sy generic require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of BLAKE3 are available, the .Sy fastest will be chosen using a micro benchmark. You can see the benchmark results by reading this kstat file: .Pa /proc/spl/kstat/zfs/chksum_bench . . .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable the processing of the free_bpobj object. . .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64 Maximum number of blocks freed in a single TXG. . .It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64 Maximum number of dedup blocks freed in a single TXG. . .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint Maximum asynchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq uint Minimum asynchronous read I/O operation active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq uint When the pool has more than this much dirty data, use .Sy zfs_vdev_async_write_max_active to limit active async writes. 
If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq uint When the pool has less than this much dirty data, use .Sy zfs_vdev_async_write_min_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 10 Pq uint Maximum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq uint Minimum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . .Pp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of .Sy 2 was chosen as a compromise. A value of .Sy 3 has been shown to improve resilver performance further at a cost of further increasing latency. . .It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq uint Maximum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq uint Minimum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq uint The maximum number of I/O operations active to each device. Ideally, this will be at least the sum of each queue's .Sy max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_open_timeout_ms Ns = Ns Sy 1000 Pq uint Timeout value to wait before determining a device is missing during import. This is helpful for transient missing paths due to links being briefly removed and recreated in response to udev events. . .It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq uint Maximum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq uint Minimum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq uint Maximum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq uint Minimum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq uint Maximum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq uint Minimum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq uint Maximum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . 
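.Pp
The linear interpolation described above for the async write limits can be sketched as follows.
This is a simplified illustration using the default values, not the in-kernel implementation:
.Bd -literal -compact
#include <stdio.h>

/*
 * Simplified sketch: scale the active async write limit between
 * zfs_vdev_async_write_min_active and _max_active as dirty data moves
 * between the min and max dirty percentages.  Not OpenZFS code.
 */
static unsigned int
async_write_limit(unsigned int dirty_pct)
{
	const unsigned int min_active = 2, max_active = 10;	/* defaults */
	const unsigned int min_dirty = 30, max_dirty = 60;	/* defaults, % */

	if (dirty_pct <= min_dirty)
		return (min_active);
	if (dirty_pct >= max_dirty)
		return (max_active);
	return (min_active + (dirty_pct - min_dirty) *
	    (max_active - min_active) / (max_dirty - min_dirty));
}

int
main(void)
{
	for (unsigned int pct = 20; pct <= 70; pct += 10)
		printf("%u%% dirty -> %u active async writes\n",
		    pct, async_write_limit(pct));
	return (0);
}
.Ed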
.It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq uint Minimum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq uint For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), the number of concurrently-active I/O operations is limited to .Sy zfs_*_min_active , unless the vdev is "idle". When there are no interactive I/O operations active (synchronous or otherwise), and .Sy zfs_vdev_nia_delay operations have completed since the last interactive operation, then the vdev is considered to be "idle", and the number of concurrently-active non-interactive operations is increased to .Sy zfs_*_max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq uint Some HDDs tend to prioritize sequential I/O so strongly, that concurrent random I/O latency reaches several seconds. On some HDDs this happens even if sequential I/O operations are submitted one at a time, and so setting .Sy zfs_*_max_active Ns = Sy 1 does not help. To prevent non-interactive I/O, like scrub, from monopolizing the device, no more than .Sy zfs_vdev_nia_credit operations can be sent while there are outstanding incomplete interactive operations. This enforced wait ensures the HDD services the interactive I/O within a reasonable amount of time. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint Maximum number of queued allocations per top-level vdev expressed as a percentage of .Sy zfs_vdev_async_write_max_active , which allows the system to detect devices that are more capable of handling allocations and to allocate more blocks to those devices. This allows for dynamic allocation distribution when devices are imbalanced, as fuller devices will tend to be slower than empty devices. .Pp Also see .Sy zio_dva_throttle_enabled . . .It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint Default queue depth for each vdev IO allocator. Higher values allow for better coalescing of sequential writes before sending them to the disk, but can increase transaction commit times. . .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. The following options may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 Device No driver retries on device errors 2 Transport No driver retries on transport errors. 4 Driver No driver retries on driver errors. .TE . .It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint Maximum number of segments to add to a BIO (min 4). If this is higher than the maximum allowed by the device queue or the kernel itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. This parameter is ignored if .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . . .It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 and earlier. This "classic" method has known issues with highly fragmented IO requests and is slower on many workloads, but it has been in use for many years and is known to be very stable. If you set this parameter, please also open a bug report why you did so, including the workload involved and any error messages. .Pp This parameter and the classic submission method will be removed once we have total confidence in the new method. .Pp This parameter only applies on Linux, and can only be set at module load time. . 
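.Pp
The bitwise-or composition described above for zfs_vdev_failfast_mask can be illustrated with a short C sketch.
The macro names below are illustrative only, not OpenZFS identifiers:
.Bd -literal -compact
#include <stdio.h>

/* Illustrative names for the documented mask bits. */
#define	FAILFAST_DEVICE		(1U << 0)	/* 1: device errors */
#define	FAILFAST_TRANSPORT	(1U << 1)	/* 2: transport errors */
#define	FAILFAST_DRIVER		(1U << 2)	/* 4: driver errors */

int
main(void)
{
	/* Default is 1 (device errors only); OR bits together to add types. */
	unsigned int mask = FAILFAST_DEVICE | FAILFAST_TRANSPORT;

	printf("zfs_vdev_failfast_mask = %u\n", mask);	/* prints 3 */
	return (0);
}
.Ed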
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . . .It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow the creation, removal, or renaming of entries in the .Sy .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled, this functionality works both locally and over NFS exports which have the .Em no_root_squash option set. . .It Sy zfs_snapshot_no_setuid Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to disable .Em setuid/setgid support for snapshot mounts triggered by access to the .Sy .zfs/snapshot directory by setting the .Em nosuid mount option. . .It Sy zfs_flags Ns = Ns Sy 0 Pq int Set additional debugging flags. The following flags may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. * 2 ZFS_DEBUG_DBUF_VERIFY Enable extra dbuf verifications. * 4 ZFS_DEBUG_DNODE_VERIFY Enable extra dnode verifications. 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. * 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-memory \fBrange_trees\fP. 512 ZFS_DEBUG_SET_ERROR Enable \fBSET_ERROR\fP and dprintf entries in the debug log. 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. .TE .Sy \& * No Requires debug build . . .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint Enables btree verification. The following settings are cumulative: .TS box; lbz r l l . Value Description 1 Verify height. 2 Verify pointers from children to parent. 3 Verify element counts. 4 Verify element order. (expensive) * 5 Verify unused memory is poisoned. (expensive) .TE .Sy \& * No Requires debug build . . .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the .Sy EIO , permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. .Pp The default "stalling" behavior is useful if the storage partially fails (i.e. some but not all I/O operations fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. Note, however, that this case is actually fairly rare. .Pp Typically pools either .Bl -enum -compact -offset 4n -width "1." .It fail completely (but perhaps temporarily, e.g. due to a top-level vdev going offline), or .It have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). .El In the former case, this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless.
In the latter, because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. It is therefore reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. . .It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint During a .Nm zfs Cm destroy operation using the .Sy async_destroy feature, a minimum of this much time will be spent working on freeing blocks per TXG. . .It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint Similar to .Sy zfs_free_min_time_ms , but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 Largest data block to write to the ZIL. Larger blocks will be treated as if the dataset being written to had the .Sy logbias Ns = Ns Sy throughput property set. . .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by .Xr zpool-initialize 8 . . .It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Size of writes used by .Xr zpool-initialize 8 . This option is used by the test suite. . .It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq u64 The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. . .It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. This is in place because livelists no longer give us a benefit once a clone has been overwritten enough. . .It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_sync . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the synctask \(em .Fn spa_livelist_condense_sync . This option is used by the test suite to trigger race conditions. . .It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_cb . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the open context condensing work in .Fn spa_livelist_condense_cb . This option is used by the test suite to trigger race conditions. . .It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq u64 The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. . .It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq u64 The maximum memory limit that can be set for a ZFS channel program, specified in bytes. .
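.Pp
This tunable and
.Sy zfs_lua_max_instrlimit
cap the per-invocation limits that
.Xr zfs-program 8
will accept.
A hypothetical invocation requesting limits below the default caps (the pool
name and script path are placeholders):
.Bd -literal -compact -offset 4n
.No example# Nm zfs Cm program Fl m Ar 10485760 Fl t Ar 10000000 Ar tank Pa /tmp/prog.lua
.Ed
.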
.It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. . .It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq u64 The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. . .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq u64 Maximum number of rows allowed in the summary of the spacemap log. . .It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq uint We currently support block sizes from .Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator. Therefore, we formerly forbade creating blocks larger than 1M. Larger blocks could be created by changing it, and pools with larger blocks can always be imported and used, regardless of this setting. .Pp Note that it is still limited by default to .Ar 1 MiB on x86_32, because Linux's 3/1 memory split doesn't leave much room for 16M chunks. . .It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. . .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. . .It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq uint Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. . .It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq uint Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of .Sy 0 disables the feature and causes all metaslab groups to be eligible for allocations. .Pp This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old .Sy zfs_mg_alloc_failures facility. . .It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place DDT data into the special allocation class. . 
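.Pp
This tunable only has an effect on pools that contain a special allocation
class vdev.
A hypothetical example of adding a mirrored special vdev to an existing pool
(the pool and device names are placeholders):
.Bd -literal -compact -offset 4n
.No example# Nm zpool Cm add Ar tank Sy special mirror Ar sdf sdg
.Ed
.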
.It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place user data indirect blocks into the special allocation class. . .It Sy zfs_multihost_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest multihost updates will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . . .It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq u64 Used to control the frequency of multihost writes which are performed when the .Sy multihost pool property is on. This is one of the factors used to determine the length of the activity check during import. .Pp The multihost write period is .Sy zfs_multihost_interval No / Sy leaf-vdevs . On average a multihost write will be issued for each leaf vdev every .Sy zfs_multihost_interval milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. . .It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint Used to control the duration of the activity test on import. Smaller values of .Sy zfs_multihost_import_intervals will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .Pp On import the activity check waits a minimum amount of time determined by .Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals , or the same product computed on the host which last had the pool imported, whichever is greater. The activity check time may be further extended if the value of MMP delay found in the best uberblock indicates actual multihost updates happened at longer intervals than .Sy zfs_multihost_interval . A minimum of .Em 100 ms is enforced. .Pp .Sy 0 No is equivalent to Sy 1 . . .It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint Controls the behavior of the pool when multihost write failures or delays are detected. .Pp When .Sy 0 , multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .Pp Otherwise, the pool will be suspended if .Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval milliseconds pass without a successful MMP write. This guarantees the activity test will see MMP writes if the pool is imported. .Sy 1 No is equivalent to Sy 2 ; this is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. . .It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. . .It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable block prefetching for scrubs. . .It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Allow no-operation writes. The occurrence of nopwrites will further depend on other pool properties .Pq i.a. the checksumming and compression algorithms . . .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable forcing TXG sync to find holes. When enabled forces ZFS to sync data when .Sy SEEK_HOLE No or Sy SEEK_DATA flags are used allowing holes in a file to be accurately reported. 
When disabled, holes will not be reported in recently dirtied files. . .It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int The number of bytes which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq uint The number of blocks pointed to by an indirect (non-L0) block which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 30 Ns % Pq u64 Control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. .Sy 0 No disables this throttle . . .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable predictive prefetch. Note that it leaves "prescient" prefetch .Pq for, e.g., Nm zfs Cm send intact. Unlike predictive prefetch, prescient prefetch never issues I/O that ends up not being needed, so it can't hurt performance. . .It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for SHA256 checksums. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for gzip compression. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for AES-GCM encryption. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Bytes to read per chunk. . .It Sy zfs_read_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest reads will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads . . .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int Include cache hits in read history. . .It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Maximum read segment size to issue when sequentially resilvering a top-level vdev. . .It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub when the last active sequential resilver completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . .It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . .It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most this many randomly selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. . .It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. .
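.Pp
A hypothetical last-resort session, setting the flag before attempting a
rewind import of a damaged pool; the pool name is a placeholder, and this can
leak space:
.Bd -literal -compact -offset 4n
.No example# Nm echo Ar 1 No > Pa /sys/module/zfs/parameters/zfs_recover
.No example# Nm zpool Cm import Fl F Ar tank
.Ed
.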
.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore hard I/O errors during device removal. When set, if a device encounters a hard I/O error during the removal process the removal will not be cancelled. This can result in a normally recoverable block becoming permanently damaged and is hence not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. . .It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. . .It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The largest contiguous segment that we will attempt to allocate when removing a device. If there is a performance problem with attempting to allocate large blocks, consider decreasing this. The default value is also the maximum. . .It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore the .Sy resilver_defer feature, causing an operation that would start a resilver to immediately restart the one in progress. . .It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between TXG flushes. . .It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub), even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . .It Sy zfs_scrub_after_expand Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub after a RAIDZ expansion completes in order to verify the checksums of all blocks which have been copied during the expansion. This is enabled by default and strongly recommended. . .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . .It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint Error blocks to be scrubbed in one txg. . .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. The frequency of this flushing is determined by this tunable. . .It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq uint This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. . .It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq uint Determines the order that data will be verified while scrubbing or resilvering: .Bl -tag -compact -offset 4n -width "a" .It Sy 1 Data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing .Pq see Sy zfs_scan_mem_lim_fact . This may improve scrub performance if the pool's data is very fragmented. .It Sy 2 The largest mostly-contiguous chunk of found data will be verified first. 
By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. .It Sy 0 .No Use strategy Sy 1 No during normal verification .No and strategy Sy 2 No while taking a checkpoint . .El . .It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int If unset, indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. Otherwise indicates that the legacy algorithm will be used, where I/O is initiated as soon as it is discovered. Unsetting will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int Sets the largest gap in bytes between scrub/resilver I/O operations that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq uint Maximum fraction of RAM used for I/O sorting by the sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. . .It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq uint The fraction of the hard limit used to determine the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . .It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint When reporting resilver throughput and estimated completion time, use the performance observed over roughly the last .Sy zfs_scan_report_txgs TXGs. When set to zero performance is calculated over the time between checkpoints. . .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. . .It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . .It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . .It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow sending of corrupt data (ignore read/checksum errors when sending). . .It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int Include unmodified spill blocks in the send stream. Under certain circumstances, previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards-compatible stream which will recreate a spill block if it was incorrectly removed. . .It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send internal queues. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum number of bytes allowed in .Nm zfs Cm send Ns 's internal queues. .
.It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send prefetch queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes that will be prefetched by .Nm zfs Cm send . This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm receive queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed in the .Nm zfs Cm receive queue. This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum amount of data, in bytes, that .Nm zfs Cm receive will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of .Sy 32 MiB . . .It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int When this variable is set to non-zero, a corrective receive: .Bl -enum -compact -offset 4n -width "1." .It Does not enforce the restriction of source & destination snapshot GUIDs matching. .It If there is an error during healing, the healing receive is not terminated; instead, it moves on to the next record. .El . .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq uint Setting this variable overrides the default logic for estimating block sizes when doing a .Nm zfs Cm send . The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. . .It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq uint Flushing of data to disk is done in passes. Defer frees starting in this pass. . .It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool metadata as the special vdevs approach capacity. . .It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq uint Starting in this sync pass, disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .Pp The original intent was that disabling compression would help the sync passes to converge. However, in practice, disabling compression increases the average number of sync passes, because when we turn compression off, many blocks' sizes will change, and thus we have to re-allocate (not overwrite) them. It also increases the number of .Em 128 KiB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. The .Em 128 KiB allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy these allocations. .
.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . .It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped, unless they're part of a larger range which was chunked. This is done because it's common for these small TRIMs to negatively impact overall performance. . .It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages, an increasing fraction of the pool's metaslabs will be initialized, progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. . .It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by .Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active . . .It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint The number of transaction groups' worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .Pp Increasing this value will allow frees to be aggregated for a longer time. This will result in larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default of .Sy 32 was determined to be a reasonable compromise. . .It Sy zfs_txg_history Ns = Ns Sy 100 Pq uint Historical statistics for this many latest TXGs will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs . . .It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq uint Flush dirty data to disk at least every this many seconds (maximum TXG duration). . .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation immediately follows its predecessor on rotational vdevs for the purpose of making decisions based on load. . .It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum distance for the last queued I/O operation in which the balancing algorithm considers an operation to have locality. .No See Sx ZFS I/O SCHEDULER . .
.It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/O operations do not immediately follow one another. . .It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by the .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Aggregate read I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq uint Aggregate write I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string Select the raidz parity implementation to use. .Pp Variants that don't depend on CPU-specific features may be selected on module load, as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. .Pp Once the module is loaded, .Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show the available options, with the currently selected one enclosed in square brackets. .Pp .TS lb l l . fastest selected by built-in benchmark original original implementation scalar scalar implementation sse2 SSE2 instruction set 64-bit x86 ssse3 SSSE3 instruction set 64-bit x86 avx2 AVX2 instruction set 64-bit x86 avx512f AVX512F instruction set 64-bit x86 avx512bw AVX512F & AVX512BW instruction sets 64-bit x86 aarch64_neon NEON Aarch64/64-bit ARMv8 aarch64_neonx2 NEON with more unrolling Aarch64/64-bit ARMv8 powerpc_altivec Altivec PowerPC .TE . .It Sy zfs_vdev_scheduler Pq charp .Sy DEPRECATED . Prints warning to kernel log for compatibility. . .It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq uint Max event queue length. Events in the queue can be viewed with .Xr zpool-events 8 . . .It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int Maximum recent zevent records to retain for duplicate checking. Setting this to .Sy 0 disables duplicate detection. . .It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int Lifespan for a recent ereport that was retained for duplicate checking. . .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. . .It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. . .It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int This controls the number of threads used by .Sy dp_zil_clean_taskq . The default value of .Sy 100% will create a maximum of one thread per cpu. . .It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint This sets the maximum block size used by the ZIL. On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . 
.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . .It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. . .It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int Setting this tunable to zero disables ZIL logging of new .Sy xattr Ns = Ns Sy sa records if the .Sy org.openzfs:zilsaxattr feature is enabled on the pool. This would only be necessary to work around bugs in the ZIL logging or replay code for this record type. The tunable has no effect if the feature is disabled. . .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint Usually, one metaslab from each normal-class vdev is dedicated for use by the ZIL to log synchronous writes. However, if there are fewer than .Sy zfs_embedded_slog_min_ms metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . .It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq uint Whether heuristic for detection of incompressible data with zstd levels >= 3 using LZ4 and zstd-1 passes is enabled. . .It Sy zstd_abort_size Ns = Ns Sy 131072 Pq uint Minimal uncompressed size (inclusive) of a record before the early abort heuristic will be attempted. . .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive: typically conditions in which a thread in the zio pipeline is looping indefinitely. . .It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int When an I/O operation takes more than this much time to complete, it's marked as slow. Each slow operation causes a delay zevent. Slow I/O counters can be seen with .Nm zpool Cm status Fl s . . .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution when devices are imbalanced. When enabled, the maximum number of pending allocations per top-level vdev is limited by .Sy zfs_vdev_queue_depth_pct . . .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int Control the naming scheme used when setting new xattrs in the user namespace. If .Sy 0 .Pq the default on Linux , user namespace xattr names are prefixed with the namespace, to be backwards compatible with previous versions of ZFS on Linux. If .Sy 1 .Pq the default on Fx , user namespace xattr names are not prefixed, to be backwards compatible with previous versions of ZFS on illumos and .Fx . 
.Pp Either naming scheme can be read on this and future versions of ZFS, regardless of this tunable, but legacy ZFS on illumos or .Fx is unable to read user namespace xattrs written in the Linux format, and legacy versions of ZFS on Linux are unable to read user namespace xattrs written in the legacy ZFS format. .Pp An existing xattr with the alternate naming scheme is removed when overwriting the xattr so as to not accumulate duplicates. . .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. These workers are responsible for I/O work such as compression, encryption, checksum and parity calculations. A fractional number of CPUs will be rounded down. .Pp The default value of .Sy 80% was chosen to avoid using all CPUs, which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. The set value only applies to pools imported or created afterwards. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. Higher values improve I/O ordering and CPU utilization, while lower values reduce lock contention. The set value only applies to pools imported or created afterwards. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. The set value only applies to pools imported or created afterwards. . .It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint Determines the minimum number of threads per write issue taskq. Higher values improve CPU utilization on high throughput, while lower values reduce taskq lock contention on high IOPS. The set value only applies to pools imported or created afterwards. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp Set the queue and thread configuration for the IO read queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. The set values only apply to pools imported or created afterwards. . .It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. The set values only apply to pools imported or created afterwards. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. . .It Sy zvol_major Ns = Ns Sy 230 Pq uint Major number for zvol block devices. . .It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq long Discard (TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the .Sy volblocksize property of a zvol. . .It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint When adding a zvol to the system, prefetch this many bytes from the start and end of the volume. Prefetching these regions of the volume is desirable, because they are likely to be accessed immediately by .Xr blkid 8 or the kernel partitioner. . .It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint When processing I/O requests for a zvol, submit them synchronously. This effectively limits the queue depth to .Em 1 for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads .
.Sy zvol_request_sync is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . .It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint Number of zvol taskqs. If .Sy 0 (the default) then scaling is done internally to prefer 6 threads per taskq. This only applies on Linux. . .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system-wide threads to use for processing zvol block IOs. If .Sy 0 (the default) then internally set .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . .It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint The number of threads per zvol to use for queuing IO requests. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. If .Sy 0 (the default) then internally set .Sy zvol_blk_mq_threads to the number of CPUs present. . .It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint Set to .Sy 1 to use the .Li blk-mq API for zvols. Set to .Sy 0 (the default) to use the legacy zvol APIs. This setting can give better or worse zvol performance depending on the workload. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. . .It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint If .Sy zvol_use_blk_mq is enabled, then process this number of .Sy volblocksize Ns -sized blocks per zvol thread. This tunable can be used to favor better performance for zvol reads (lower values) or writes (higher values). If set to .Sy 0 , then the zvol layer will process the maximum number of blocks per thread that it can. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. . .It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint The queue_depth value for the zvol .Li blk-mq interface. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. If .Sy 0 (the default) then use the kernel's default queue depth. Values are clamped to the kernel's .Dv BLKDEV_MIN_RQ and .Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ limits. . .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines zvol block devices' behaviour when .Sy volmode Ns = Ns Sy default : .Bl -tag -compact -offset 4n -width "a" .It Sy 1 .No equivalent to Sy full .It Sy 2 .No equivalent to Sy dev .It Sy 3 .No equivalent to Sy none .El . .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. The strict quota enforcement may have a performance impact. .El . .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. The scheduler divides operations into five I/O classes, prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, .Sy zfs_vdev_max_active . Note that the sum of the per-queue minima must not exceed the aggregate maximum. If the sum of the per-queue maxima exceeds the aggregate maximum, then the number of active operations may reach .Sy zfs_vdev_max_active , in which case no further operations will be issued, regardless of whether all per-queue minima have been met. .Pp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers.
Furthermore, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .Pp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit, or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O operation is queued or an operation completes, the scheduler looks for new operations to issue. .Pp In general, smaller .Sy max_active Ns s will lead to lower latency of synchronous operations. Larger .Sy max_active Ns s may lead to higher overall throughput, depending on underlying storage. .Pp The ratio of the queues' .Sy max_active Ns s determines the balance of performance between reads, writes, and scrubs. For example, increasing .Sy zfs_vdev_scrub_max_active will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .Pp All I/O classes have a fixed maximum number of outstanding operations, except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically, so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write operations according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of simultaneous operations also stabilizes the response time of operations from other queues, in particular synchronous ones. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there is more dirty data in the pool. . .Ss Async Writes The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points: .Bd -literal | o---------| <-- \fBzfs_vdev_async_write_max_active\fP ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- \fBzfs_vdev_async_write_min_active\fP 0|_______^______|_________| 0% | | 100% of \fBzfs_dirty_data_max\fP | | | `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP .Ed .Pp Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .Pp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between .Sy zfs_vdev_async_write_active_min_dirty_percent and .Sy zfs_vdev_async_write_active_max_dirty_percent . 
If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. . .Sh ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .Pp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .Pp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .Pp The minimum time for a transaction to take is calculated as .D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms) .Pp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by .Sy zfs_delay_min_dirty_percent . This should typically be at or above .Sy zfs_vdev_async_write_active_max_dirty_percent , so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by .Sy zfs_delay_scale . Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .Bd -literal delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | \fBzfs_delay_scale\fP ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note, that since the delay is added to the outstanding time remaining on the most recent transaction it's effectively the inverse of IOPS. Here, the midpoint of .Em 500 us translates to .Em 2000 IOPS . The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first three quarters of the curve yield relatively small differences in the amount of delay. .Pp The effects can be easier to understand when the amount of delay is represented on a logarithmic scale: .Bd -literal delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + \fBzfs_delay_scale\fP ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the back-end storage, and then by changing the value of .Sy zfs_delay_scale to increase the steepness of the curve. diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ad9755ba50a4..7b392a896150 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -1,1043 +1,1060 @@ .\" .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. 
All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] -.\" Copyright (c) 2019, Klara Inc. +.\" Copyright (c) 2019, 2023, 2024, Klara, Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley -.\" Copyright (c) 2023, Klara Inc. .\" -.Dd February 14, 2024 +.Dd October 2, 2024 .Dt ZPOOL-FEATURES 7 .Os . .Sh NAME .Nm zpool-features .Nd description of ZFS pool features . .Sh DESCRIPTION ZFS pool on-disk format versions are specified via .Dq features which replace the old on-disk format numbers .Pq the last supported on-disk format number is 28 . To enable a feature on a pool use the .Nm zpool Cm upgrade , or set the .Sy feature Ns @ Ns Ar feature-name property to .Sy enabled . Please also see the .Sx Compatibility feature sets section for information on how sets of features may be enabled together. .Pp The pool format does not affect file system version compatibility or the ability to send file systems between pools. .Pp Since most features can be enabled independently of each other, the on-disk format of the pool is specified by the set of all features marked as .Sy active on the pool. If the pool was created by another software version this set may include unsupported features. . .Ss Identifying features Every feature has a GUID of the form .Ar com.example : Ns Ar feature-name . The reversed DNS name ensures that the feature's GUID is unique across all ZFS implementations. When unsupported features are encountered on a pool they will be identified by their GUIDs. Refer to the documentation for the ZFS implementation that created the pool for information about those features. .Pp Each supported feature also has a short name. By convention a feature's short name is the portion of its GUID which follows the .Sq \&: .Po i.e. .Ar com.example : Ns Ar feature-name would have the short name .Ar feature-name .Pc , however a feature's short name may differ across ZFS implementations if following the convention would result in name conflicts. . .Ss Feature states Features can be in one of three states: .Bl -tag -width "disabled" .It Sy active This feature's on-disk format changes are in effect on the pool. Support for this feature is required to import the pool in read-write mode. If this feature is not read-only compatible, support is also required to import the pool in read-only mode .Pq see Sx Read-only compatibility . .It Sy enabled An administrator has marked this feature as enabled on the pool, but the feature's on-disk format changes have not been made yet. The pool can still be imported by software that does not support this feature, but changes may be made to the on-disk format at any time which will move the feature to the .Sy active state. 
Some features may support returning to the .Sy enabled state after becoming .Sy active . See feature-specific documentation for details. .It Sy disabled This feature's on-disk format changes have not been made and will not be made unless an administrator moves the feature to the .Sy enabled state. Features cannot be disabled once they have been enabled. .El .Pp The state of supported features is exposed through pool properties of the form .Sy feature Ns @ Ns Ar short-name . . .Ss Read-only compatibility Some features may make on-disk format changes that do not interfere with other software's ability to read from the pool. These features are referred to as .Dq read-only compatible . If all unsupported features on a pool are read-only compatible, the pool can be imported in read-only mode by setting the .Sy readonly property during import .Po see .Xr zpool-import 8 for details on importing pools .Pc . . .Ss Unsupported features For each unsupported feature enabled on an imported pool, a pool property named .Sy unsupported Ns @ Ns Ar feature-name will indicate why the import was allowed despite the unsupported feature. Possible values for this property are: .Bl -tag -width "readonly" .It Sy inactive The feature is in the .Sy enabled state and therefore the pool's on-disk format is still compatible with software that does not support this feature. .It Sy readonly The feature is read-only compatible and the pool has been imported in read-only mode. .El . .Ss Feature dependencies Some features depend on other features being enabled in order to function. Enabling a feature will automatically enable any features it depends on. . .Ss Compatibility feature sets It is sometimes necessary for a pool to maintain compatibility with a specific on-disk format, by enabling and disabling particular features. The .Sy compatibility feature facilitates this by allowing feature sets to be read from text files. When set to .Sy off .Pq the default , compatibility feature sets are disabled .Pq i.e. all features are enabled ; when set to .Sy legacy , no features are enabled. When set to a comma-separated list of filenames .Po each filename may either be an absolute path, or relative to .Pa /etc/zfs/compatibility.d or .Pa /usr/share/zfs/compatibility.d .Pc , the lists of requested features are read from those files, separated by whitespace and/or commas. Only features present in all files are enabled. .Pp Simple sanity checks are applied to the files: they must be between 1 B and 16 KiB in size, and must end with a newline character. .Pp The requested features are applied when a pool is created using .Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar … and controls which features are enabled when using .Nm zpool Cm upgrade . .Nm zpool Cm status will not show a warning about disabled features which are not part of the requested feature set. .Pp The special value .Sy legacy prevents any features from being enabled, either via .Nm zpool Cm upgrade or .Nm zpool Cm set Sy feature Ns @ Ns Ar feature-name Ns = Ns Sy enabled . This setting also prevents pools from being upgraded to newer on-disk versions. This is a safety measure to prevent new features from being accidentally enabled, breaking compatibility. .Pp By convention, compatibility files in .Pa /usr/share/zfs/compatibility.d are provided by the distribution, and include feature sets supported by important versions of popular distributions, and feature sets commonly supported at the start of each year. 
Compatibility files in .Pa /etc/zfs/compatibility.d , if present, will take precedence over files with the same name in .Pa /usr/share/zfs/compatibility.d . .Pp If an unrecognized feature is found in these files, an error message will be shown. If the unrecognized feature is in a file in .Pa /etc/zfs/compatibility.d , this is treated as an error and processing will stop. If the unrecognized feature is under .Pa /usr/share/zfs/compatibility.d , this is treated as a warning and processing will continue. This difference is to allow distributions to include features which might not be recognized by the currently-installed binaries. .Pp Compatibility files may include comments: any text from .Sq # to the end of the line is ignored. .Pp .Sy Example : .Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 versions from v2.12 onwards. allocation_classes async_destroy block_cloning bookmarks device_rebuild embedded_data empty_bpobj enabled_txg extensible_dataset filesystem_limits hole_birth large_blocks livelist log_spacemap lz4_compress project_quota resilver_defer spacemap_histogram spacemap_v2 userobj_accounting zilsaxattr zpool_checkpoint .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2-2.06 # Features which are supported by GRUB2 versions prior to v2.12. # # GRUB is not able to detect ZFS pool if snapshot of top level boot pool # is created. This issue is observed with GRUB versions before v2.12 if # extensible_dataset feature is enabled on ZFS boot pool. # # This file lists all read-only compatible features except # extensible_dataset and any other feature that depends on it. # allocation_classes async_destroy block_cloning device_rebuild embedded_data empty_bpobj enabled_txg hole_birth log_spacemap lz4_compress resilver_defer spacemap_histogram spacemap_v2 zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed .Pp See .Xr zpool-create 8 and .Xr zpool-upgrade 8 for more information on how these commands are affected by feature sets. . .de feature .It Sy \\$2 .Bl -tag -compact -width "READ-ONLY COMPATIBLE" .It GUID .Sy \\$1:\\$2 .if !"\\$4"" \{\ .It DEPENDENCIES \fB\\$4\fP\c .if !"\\$5"" , \fB\\$5\fP\c .if !"\\$6"" , \fB\\$6\fP\c .if !"\\$7"" , \fB\\$7\fP\c .if !"\\$8"" , \fB\\$8\fP\c .if !"\\$9"" , \fB\\$9\fP\c .\} .It READ-ONLY COMPATIBLE \\$3 .El .Pp .. . .ds instant-never \ .No This feature becomes Sy active No as soon as it is enabled \ and will never return to being Sy enabled . . .ds remount-upgrade \ .No Each filesystem will be upgraded automatically when remounted, \ or when a new file is created under that filesystem. \ The upgrade can also be triggered on filesystems via \ Nm zfs Cm set Sy version Ns = Ns Sy current Ar fs . \ No The upgrade process runs in the background and may take a while to complete \ for filesystems containing large numbers of files . . .de checksum-spiel When the .Sy \\$1 feature is set to .Sy enabled , the administrator can turn on the .Sy \\$1 checksum on any dataset using .Nm zfs Cm set Sy checksum Ns = Ns Sy \\$1 Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy checksum property has been set to .Sy \\$1 , and will return to being .Sy enabled once all filesystems that have ever had their checksum set to .Sy \\$1 are destroyed. .. .
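.Pp
As an illustrative sketch
.Pq the pool name Ar tank No is hypothetical ,
the compatibility property can also be set on an existing pool, and the state
of an individual feature can be inspected through the corresponding pool
properties:
.Bd -literal -compact -offset 4n
.No example# Nm zpool Cm set Sy compatibility Ns = Ns Ar grub2 Ar tank
.No example# Nm zpool Cm get Sy compatibility Ar tank
.No example# Nm zpool Cm get Sy feature@async_destroy Ar tank
.Ed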
.Sh FEATURES The following features are supported on this system: .Bl -tag -width Ds .feature org.zfsonlinux allocation_classes yes This feature enables support for separate allocation classes. .Pp This feature becomes .Sy active when a dedicated allocation class vdev .Pq dedup or special is created with the .Nm zpool Cm create No or Nm zpool Cm add No commands . With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. Without .Sy async_destroy , the file system is not fully removed until all space has been reclaimed. If the destroy operation is interrupted by a reboot or power outage, the next attempt to open the pool will need to complete the destroy operation synchronously. .Pp When .Sy async_destroy is enabled, the file system's data will be reclaimed by a background process, allowing the destroy operation to complete without traversing the entire file system. The background process is able to resume interrupted destroys after the pool has been opened, eliminating the need to finish interrupted destroys as part of the open operation. The amount of space remaining to be reclaimed by the background process is available through the .Sy freeing property. .Pp This feature is only .Sy active while .Sy freeing is non-zero. . .feature org.openzfs blake3 no extensible_dataset This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. BLAKE3 is a secure hash algorithm focused on high performance. .Pp .checksum-spiel blake3 . .feature com.fudosecurity block_cloning yes When this feature is enabled, ZFS will use block cloning for operations like .Fn copy_file_range 2 . Block cloning allows creating multiple references to a single block. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. Blocks can be cloned across datasets under some conditions (like equal .Nm recordsize , the same master encryption key, etc.). ZFS tries its best to clone across datasets including encrypted ones. This is limited for various (nontrivial) reasons depending on the OS and/or ZFS internals. .Pp This feature becomes .Sy active when the first block is cloned. When the last cloned block is freed, it goes back to the enabled state. . .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark command. .Pp This feature is .Sy active while any bookmarks exist in the pool. All bookmarks in the pool can be listed by running .Nm zfs Cm list Fl t Sy bookmark Fl r Ar poolname . . .feature com.datto bookmark_v2 no bookmark extensible_dataset This feature enables the creation and management of larger bookmarks which are needed for other features in ZFS. .Pp This feature becomes .Sy active when a v2 bookmark is created and will be returned to the .Sy enabled state when all v2 bookmarks are destroyed. . .feature com.delphix bookmark_written no bookmark extensible_dataset bookmark_v2 This feature enables additional bookmark accounting fields, enabling the .Sy written Ns # Ns Ar bookmark property .Pq space written since a bookmark and estimates of send stream sizes for incrementals from bookmarks. .Pp This feature becomes .Sy active when a bookmark is created and will be returned to the .Sy enabled state when all bookmarks with these fields are destroyed. .
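.Pp
As a brief illustrative sketch
.Pq dataset, snapshot, and bookmark names are hypothetical ,
a bookmark that activates these features can be created from an existing
snapshot and listed afterwards:
.Bd -literal -compact -offset 4n
.No example# Nm zfs Cm bookmark Ar rpool/data@snap1 Ar rpool/data#mark1
.No example# Nm zfs Cm list Fl t Sy bookmark Fl r Ar rpool
.Ed
.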
.feature org.openzfs device_rebuild yes This feature enables the ability for the .Nm zpool Cm attach and .Nm zpool Cm replace commands to perform sequential reconstruction .Pq instead of healing reconstruction when resilvering. .Pp Sequential reconstruction resilvers a device in LBA order without immediately verifying the checksums. Once complete, a scrub is started, which then verifies the checksums. This approach allows full redundancy to be restored to the pool in the minimum amount of time. This two-phase approach will take longer than a healing resilver when the time to verify the checksums is included. However, unless there is additional pool damage, no checksum errors should be reported by the scrub. This feature is incompatible with raidz configurations. . This feature becomes .Sy active while a sequential resilver is in progress, and returns to .Sy enabled when the resilver completes. . .feature com.delphix device_removal no This feature enables the .Nm zpool Cm remove command to remove top-level vdevs, evacuating them to reduce the total size of the pool. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.openzfs draid no This feature enables use of the .Sy draid vdev type. dRAID is a variant of RAID-Z which provides integrated distributed hot spares that allow faster resilvering while retaining the benefits of RAID-Z. Data, parity, and spare space are organized in redundancy groups and distributed evenly over all of the devices. .Pp This feature becomes .Sy active when creating a pool which uses the .Sy draid vdev type, or when adding a new .Sy draid vdev to an existing pool. . .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite .Po if compression is also enabled, an overwrite of a block whose checksum matches the data being written will be ignored .Pc . In an abundance of caution, Edon-R requires verification when used with dedup: .Nm zfs Cm set Sy dedup Ns = Ns Sy edonr , Ns Sy verify .Po see Xr zfs-set 8 Pc . .Pp Edon-R is a very high-performance hash algorithm that was part of the NIST SHA-3 competition. It provides extremely high hash performance .Pq over 350% faster than SHA-256 , but was not selected because of its unsuitability as a general purpose secure hash algorithm. This implementation utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of highly-compressible blocks. Blocks whose contents can compress to 112 bytes or smaller can take advantage of this feature. .Pp When this feature is enabled, the contents of highly-compressible blocks are stored in the block .Dq pointer itself .Po a misnomer in this case, as it contains the compressed data, rather than a pointer to its location on disk .Pc . Thus the space of the block .Pq one sector, typically 512 B or 4 KiB is saved, and no additional I/O is needed to read and write the data block. . \*[instant-never] . 
.feature com.delphix empty_bpobj yes This feature increases the performance of creating and using a large number of snapshots of a single filesystem or volume, and also reduces the disk space required. .Pp When there are many snapshots, each snapshot uses many Block Pointer Objects .Pq bpobjs to track blocks associated with that snapshot. However, in common use cases, most of these bpobjs are empty. This feature allows us to create each bpobj on-demand, thus eliminating the empty bpobjs. .Pp This feature is .Sy active while there are any filesystems, volumes, or snapshots which were created after enabling this feature. . .feature com.delphix enabled_txg yes Once this feature is enabled, ZFS records the transaction group number in which new features are enabled. This has no user-visible impact, but other features may depend on this feature. .Pp This feature becomes .Sy active as soon as it is enabled and will never return to being .Sy enabled . . .feature com.datto encryption no bookmark_v2 extensible_dataset This feature enables the creation and management of natively encrypted datasets. .Pp This feature becomes .Sy active when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.klarasystems fast_dedup yes This feature allows more advanced deduplication features to be enabled on new dedup tables. .Pp This feature will be .Sy active when the first deduplicated block is written after a new dedup table is created (i.e. after a new pool is created, or a new checksum is used on a dataset with .Sy dedup enabled). It will be returned to the .Sy enabled state when all deduplicated blocks using it are freed. . .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. .Pp This feature will be .Sy active when the first dependent feature uses it, and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.joyent filesystem_limits yes extensible_dataset This feature enables filesystem and snapshot limits. These limits can be used to control how many filesystems and/or snapshots can be created at the point in the tree on which the limits are set. .Pp This feature is .Sy active once either of the limit properties has been set on a dataset and will never return to being .Sy enabled . . .feature com.delphix head_errlog no This feature enables the upgraded version of errlog, which required an on-disk error log format change. The error log of each head dataset is now stored separately in the ZAP object and keyed by the head dataset ID. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . For encrypted filesystems with unloaded keys, snapshots and clones cannot be checked for errors and will not be listed; an "access denied" error is reported for them instead. .Pp \*[instant-never] . .feature com.delphix hole_birth no enabled_txg This feature has/had bugs, the result of which is that, if you do a .Nm zfs Cm send Fl i .Pq or Fl R , No since it uses Fl i from an affected dataset, the receiving party will not see any checksum or other errors, but the resulting destination snapshot will not match the source. Its use by .Nm zfs Cm send Fl i has been disabled by default .Po see .Sy send_holes_without_birth_time in .Xr zfs 4 .Pc .
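.Pp
As an illustrative aside, on Linux the current setting of that parameter can
typically be inspected through the module parameter interface
.Pq path shown for illustration only :
.Bd -literal -compact -offset 4n
.No example# Nm cat Pa /sys/module/zfs/parameters/send_holes_without_birth_time
.Ed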
.Pp This feature improves performance of incremental sends .Pq Nm zfs Cm send Fl i and receives for objects with many holes. The most common case of hole-filled objects is zvols. .Pp An incremental send stream from snapshot .Sy A No to snapshot Sy B contains information about every block that changed between .Sy A No and Sy B . Blocks which did not change between those snapshots can be identified and omitted from the stream using a piece of metadata called the .Dq block birth time , but birth times are not recorded for holes .Pq blocks filled only with zeroes . Since holes created after .Sy A No cannot be distinguished from holes created before Sy A , information about every hole in the entire filesystem or zvol is included in the send stream. .Pp For workloads where holes are rare this is not a problem. However, when incrementally replicating filesystems or zvols with many holes .Pq for example a zvol formatted with another filesystem a lot of time will be spent sending and receiving unnecessary information about holes that already exist on the receiving side. .Pp Once the .Sy hole_birth feature has been enabled, the block birth times of all new holes will be recorded. Incremental sends between snapshots created after this feature is enabled will use this new metadata to avoid sending information about holes that already exist on the receiving side. .Pp \*[instant-never] . .feature org.open-zfs large_blocks no extensible_dataset This feature allows the record size on a dataset to be set larger than 128 KiB. .Pp This feature becomes .Sy active once a dataset contains a file with a block size larger than 128 KiB, and will return to being .Sy enabled once all filesystems that have ever had their recordsize larger than 128 KiB are destroyed. . .feature org.zfsonlinux large_dnode no extensible_dataset This feature allows the size of dnodes in a dataset to be set larger than 512 B. . This feature becomes .Sy active once a dataset contains an object with a dnode larger than 512 B, which occurs as a result of setting the .Sy dnodesize dataset property to a value other than .Sy legacy . The feature will return to being .Sy enabled once all filesystems that have ever contained a dnode larger than 512 B are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. . +.feature com.klarasystems large_microzap yes extensible_dataset large_blocks +This feature allows "micro" ZAPs to grow larger than 128 KiB without being +upgraded to "fat" ZAPs. +.Pp +This feature becomes +.Sy active +the first time a micro ZAP grows larger than 128 KiB. +It will only be returned to the +.Sy enabled +state when all datasets that ever had a large micro ZAP are destroyed. +.Pp +Note that even when this feature is enabled, micro ZAPs cannot grow larger +than 128 KiB without also changing the +.Sy zap_micro_max_size +module parameter. +See +.Xr zfs 4 . +. .feature com.delphix livelist yes extensible_dataset This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. All blocks allocated and freed after a clone is created are tracked by the clone's livelist, which is referenced during the deletion of the clone. The feature is activated when a clone is created and remains .Sy active until all clones have been destroyed. .
.feature com.delphix log_spacemap yes com.delphix:spacemap_v2 This feature improves performance for heavily-fragmented pools, especially when workloads are heavy in random writes. It does so by logging all the metaslab changes on a single spacemap every TXG instead of scattering multiple writes to all the metaslab spacemaps. .Pp \*[instant-never] . .feature org.zfsonlinux longname no extensible_dataset This feature allows creating files and directories with names up to 1023 bytes in length. A new dataset property .Sy longname is also introduced to toggle longname support for each dataset individually. This property can be disabled even if the dataset contains longname files. In that case, new files cannot be created with long names, but existing longname files can still be looked up. .Pp This feature becomes .Sy active when a file name longer than 255 bytes is created in a dataset, and returns to being .Sy enabled when all such datasets are destroyed. . .feature org.illumos lz4_compress no .Sy lz4 is a high-performance real-time compression algorithm that features significantly faster compression and decompression as well as a higher compression ratio than the older .Sy lzjb compression. Typically, .Sy lz4 compression is approximately 50% faster on compressible data and 200% faster on incompressible data than .Sy lzjb . It is also approximately 80% faster on decompression, while giving approximately a 10% better compression ratio. .Pp When the .Sy lz4_compress feature is set to .Sy enabled , the administrator can turn on .Sy lz4 compression on any dataset on the pool using the .Xr zfs-set 8 command. All newly written metadata will be compressed with the .Sy lz4 algorithm. .Pp \*[instant-never] . .feature com.joyent multi_vdev_crash_dump no This feature allows a dump device to be configured with a pool comprised of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz configuration. .Pp When the .Sy multi_vdev_crash_dump feature is set to .Sy enabled , the administrator can use .Xr dumpadm 8 to configure a dump device on a pool comprised of multiple vdevs. .Pp Under .Fx and Linux this feature is unused, but registered for compatibility. New pools created on these systems will have the feature .Sy enabled but will never transition to .Sy active , as this functionality is not required for crash dump support. Existing pools where this feature is .Sy active can be imported. . .feature com.delphix obsolete_counts yes device_removal This feature is an enhancement of .Sy device_removal , which will over time reduce the memory used to track removed devices. When indirect blocks are freed or remapped, we note that their part of the indirect mapping is .Dq obsolete – no longer needed. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account the space and object usage information against the project identifier .Pq ID . .Pp The project ID is an object-based attribute. When upgrading an existing filesystem, objects without a project ID will be assigned a zero project ID. When this feature is enabled, newly created objects inherit their parent directories' project ID if the parent's inherit flag is set .Pq via Nm chattr Sy [+-]P No or Nm zfs Cm project Fl s Ns | Ns Fl C . Otherwise, the new object's project ID will be zero.
An object's project ID can be changed at any time by the owner .Pq or privileged user via .Nm chattr Fl p Ar prjid or .Nm zfs Cm project Fl p Ar prjid . .Pp This feature will become .Sy active as soon as it is enabled and will never return to being .Sy enabled . \*[remount-upgrade] . .feature org.openzfs raidz_expansion no none This feature enables the .Nm zpool Cm attach subcommand to attach a new device to a RAID-Z group, expanding the total amount of usable space in the pool. See .Xr zpool-attach 8 . . .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , which create redaction bookmarks storing the list of blocks redacted by the send that created them. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.delphix redacted_datasets no extensible_dataset This feature enables the receiving of redacted .Nm zfs Cm send streams, which create redacted datasets when received. These datasets are missing some of their blocks, and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receives, see .Xr zfs-send 8 . . .feature com.delphix redaction_list_spill no redaction_bookmarks This feature enables the redaction list created by zfs redact to store many more entries. It becomes .Sy active when a redaction list is created with more than 36 entries, and returns to being .Sy enabled when no long redaction lists remain in the pool. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.datto resilver_defer yes This feature allows ZFS to postpone new resilvers if an existing one is already in progress. Without this feature, any new resilvers will cause the currently running one to be immediately restarted from the beginning. .Pp This feature becomes .Sy active once a resilver has been deferred, and returns to being .Sy enabled when the deferred resilver begins. . .feature org.illumos sha512 no extensible_dataset This feature enables the use of the SHA-512/256 truncated hash algorithm .Pq FIPS 180-4 for checksum and dedup. The native 64-bit arithmetic of SHA-512 provides an approximate 50% performance boost over SHA-256 on 64-bit hardware and is thus a good minimum-change replacement candidate for systems where hash performance is important, but these systems cannot for whatever reason utilize the faster .Sy skein No and Sy edonr algorithms. .Pp .checksum-spiel sha512 . .feature org.illumos skein no extensible_dataset This feature enables the use of the Skein hash algorithm for checksum and dedup. Skein is a high-performance secure hash algorithm that was a finalist in the NIST SHA-3 competition. It provides a very high security margin and high performance on 64-bit hardware .Pq 80% faster than SHA-256 . This implementation also utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel skein . .feature com.delphix spacemap_histogram yes This feature allows ZFS to maintain more information about how free space is organized within the pool. If this feature is .Sy enabled , it will be activated when a new space map object is created, or an existing space map is upgraded to the new format, and never returns back to being .Sy enabled . .
.feature com.delphix spacemap_v2 yes This feature enables the use of the new space map encoding which consists of two words .Pq instead of one whenever it is advantageous. The new encoding allows space maps to represent large regions of space more efficiently on-disk while also increasing their maximum addressable offset. .Pp This feature becomes .Sy active once it is .Sy enabled , and never returns back to being .Sy enabled . . .feature org.zfsonlinux userobj_accounting yes extensible_dataset This feature allows administrators to account the object usage information by user and group. .Pp \*[instant-never] \*[remount-upgrade] . .feature com.klarasystems vdev_zaps_v2 no This feature creates a ZAP object for the root vdev. .Pp This feature becomes active after the next .Nm zpool Cm import or .Nm zpool reguid . . Properties can be retrieved or set on the root vdev using .Nm zpool Cm get and .Nm zpool Cm set with .Sy root as the vdev name which is an alias for .Sy root-0 . .feature org.openzfs zilsaxattr yes extensible_dataset This feature enables .Sy xattr Ns = Ns Sy sa extended attribute logging in the ZIL. If enabled, extended attribute changes .Pq both Sy xattrdir Ns = Ns Sy dir No and Sy xattr Ns = Ns Sy sa are guaranteed to be durable if either the dataset had .Sy sync Ns = Ns Sy always set at the time the changes were made, or .Xr sync 2 is called on the dataset after the changes were made. .Pp This feature becomes .Sy active when a ZIL is created for at least one dataset and will be returned to the .Sy enabled state when it is destroyed for all datasets that use this feature. . .feature com.delphix zpool_checkpoint yes This feature enables the .Nm zpool Cm checkpoint command that can checkpoint the state of the pool at the time it was issued and later rewind back to it or discard it. .Pp This feature becomes .Sy active when the .Nm zpool Cm checkpoint command is used to checkpoint the pool. The feature will only return back to being .Sy enabled when the pool is rewound or the checkpoint has been discarded. . .feature org.freebsd zstd_compress no extensible_dataset .Sy zstd is a high-performance compression algorithm that features a combination of high compression ratios and high speed. Compared to .Sy gzip , .Sy zstd offers slightly better compression at much higher speeds. Compared to .Sy lz4 , .Sy zstd offers much better compression while being only modestly slower. Typically, .Sy zstd compression speed ranges from 250 to 500 MB/s per thread and decompression speed is over 1 GB/s per thread. .Pp When the .Sy zstd feature is set to .Sy enabled , the administrator can turn on .Sy zstd compression of any dataset using .Nm zfs Cm set Sy compress Ns = Ns Sy zstd Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy compress property has been set to .Sy zstd , and will return to being .Sy enabled once all filesystems that have ever had their .Sy compress property set to .Sy zstd are destroyed. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 index ba604bf77855..877d954147b6 100644 --- a/man/man8/zfs-send.8 +++ b/man/man8/zfs-send.8 @@ -1,734 +1,738 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. 
.\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd July 27, 2023 +.Dd October 2, 2024 .Dt ZFS-SEND 8 .Os . .Sh NAME .Nm zfs-send .Nd generate backup stream of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm send .Op Fl DLPVbcehnpsvw .Op Fl R Op Fl X Ar dataset Ns Oo , Ns Ar dataset Oc Ns … .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Nm zfs .Cm send .Op Fl DLPVcensvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPVcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Nm zfs .Cm send .Op Fl PVenv .Fl t .Ar receive_resume_token .Nm zfs .Cm send .Op Fl PVnv .Fl S Ar filesystem .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm send .Op Fl DLPVbcehnpsvw .Op Fl R Op Fl X Ar dataset Ns Oo , Ns Ar dataset Oc Ns … .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Xc Creates a stream representation of the second .Ar snapshot , which is written to standard output. The output can be redirected to a file or to a different system .Po for example, using .Xr ssh 1 .Pc . By default, a full stream is generated. .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, .Fl I Em @a Em fs@d is similar to .Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . The incremental source may be specified as with the .Fl i option. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128 KiB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128 KiB. The receiving system must have the .Sy large_blocks pool feature enabled as well. +This flag is required if the +.Sy large_microzap +pool feature is active. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. .It Fl R , -replicate Generate a replication stream package, which will replicate the specified file system, and all descendent file systems, up to the named snapshot. 
When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. If the .Fl R flag is used to send encrypted datasets, then .Fl w must also be specified. .It Fl V , -proctitle Set the process title to a per-second report of how much data has been sent. .It Fl X , -exclude Ar dataset Ns Oo , Ns Ar dataset Oc Ns … With .Fl R , .Fl X specifies a set of datasets (and, hence, their descendants), to be excluded from the send stream. The root dataset may not be excluded. .Fl X Ar a Fl X Ar b is equivalent to .Fl X Ar a , Ns Ar b . .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl b , -backup Sends only received property values whether or not they are overridden by local settings, but only if the dataset has ever been received. Use this option when you want .Nm zfs Cm receive to restore received properties backed up on the sent dataset and to avoid sending local settings that may have nothing to do with the source dataset, but only with how the data is backed up. .It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. Streams sent with .Fl c will not have their data recompressed on the receiver side using .Fl o Sy compress Ns = Ar value . The data will stay compressed as it was from the sender. The new compression property will be set for future data. Note that uncompressed data from the sender will still attempt to compress on the receiver, unless you specify .Fl o Sy compress Ns = Em off . .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . 
Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl h , -holds Generate a stream package that includes any snapshot holds (created with the .Nm zfs Cm hold command), and indicating to .Nm zfs Cm receive that the holds be applied to the dataset on the receiving system. .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot .Pq the incremental source to the second .Ar snapshot .Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Po the .Sy @ character and following .Pc and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified .Po for example, .Em pool/fs@origin , not just .Em @origin .Pc . .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl p , -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. Sends of encrypted datasets must use .Fl w when using this flag. .It Fl s , -skip-missing Allows sending a replication stream even when there are snapshots missing in the hierarchy. When a snapshot is missing, instead of throwing an error and aborting the send, a warning is printed to the standard error stream and the dataset to which it belongs and its descendents are skipped. This flag can only be used in conjunction with .Fl R . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. The same report can be requested by sending .Dv SIGINFO or .Dv SIGUSR1 , regardless of .Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. .El .It Xo .Nm zfs .Cm send .Op Fl DLPVcenvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Qq --head-- . .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128 KiB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128 KiB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. 
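.Pp
As a brief illustrative sketch
.Pq dataset, bookmark, and snapshot names are hypothetical ,
this flag is commonly combined with
.Fl n
to print a machine-parsable estimate of an incremental stream from a bookmark
without generating any send data:
.Bd -literal -compact -offset Ds
.No # Nm zfs Cm send Fl nP Fl i Ar pool/fs#mark Ar pool/fs@snap
.Ed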
.It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's file system, in which case it can be specified as the last component of the name .Po the .Sy # or .Sy @ character and following .Pc . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. The same report can be requested by sending .Dv SIGINFO or .Dv SIGUSR1 , regardless of .Fl v . .El .It Xo .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPVcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Xc Generate a redacted send stream. 
This send stream contains all blocks from the snapshot being sent that aren't included in the redaction list contained in the bookmark specified by the .Fl -redact (or .Fl d ) flag. The resulting send stream is said to be redacted with respect to the snapshots the bookmark specified by the .Fl -redact No flag was created with . The bookmark must have been created by running .Nm zfs Cm redact on the snapshot being sent. .Pp This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. For example, if a filesystem contains sensitive data, and it has clones where that sensitive data has been secured or replaced with dummy data, redacted sends can be used to replicate the secured data without replicating the original sensitive data, while still sharing all possible blocks. A snapshot that has been redacted with respect to a set of snapshots will contain all blocks referenced by at least one snapshot in the set, but will contain none of the blocks referenced by none of the snapshots in the set. In other words, if all snapshots in the set have modified a given block in the parent, that block will not be sent; but if one or more snapshots have not modified a block in the parent, they will still reference the parent's block, so that block will be sent. Note that only user data will be redacted. .Pp When the redacted send stream is received, we will generate a redacted snapshot. Due to the nature of redaction, a redacted dataset can only be used in the following ways: .Bl -enum -width "a." .It To receive, as a clone, an incremental send from the original snapshot to one of the snapshots it was redacted with respect to. In this case, the stream will produce a valid dataset when received because all blocks that were redacted in the parent are guaranteed to be present in the child's send stream. This use case will produce a normal snapshot, which can be used just like other snapshots. . .It To receive an incremental send from the original snapshot to something redacted with respect to a subset of the set of snapshots the initial snapshot was redacted with respect to. In this case, each block that was redacted in the original is still redacted (redacting with respect to additional snapshots causes less data to be redacted (because the snapshots define what is permitted, and everything else is redacted)). This use case will produce a new redacted snapshot. .It To receive an incremental send from a redaction bookmark of the original snapshot that was created when redacting with respect to a subset of the set of snapshots the initial snapshot was created with respect to, to anything else. A send stream from such a redaction bookmark will contain all of the blocks necessary to fill in any redacted data, should it be needed, because the sending system is aware of what blocks were originally redacted. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive an incremental send from a redacted version of the initial snapshot that is redacted with respect to a subset of the set of snapshots the initial snapshot was created with respect to. A send stream from a compatible redacted dataset will contain all of the blocks necessary to fill in any redacted data. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive a full send as a clone of the redacted snapshot.
Since the stream is a full send, it definitionally contains all the data needed to create a new dataset. This use case will either produce a normal snapshot or a redacted one, depending on whether the full send stream was redacted. .El .Pp These restrictions are detected and enforced by .Nm zfs Cm receive ; a redacted send stream will contain the list of snapshots that the stream is redacted with respect to. These are stored with the redacted snapshot, and are used to detect and correctly handle the cases above. Note that for technical reasons, raw sends and redacted sends cannot be combined at this time. .It Xo .Nm zfs .Cm send .Op Fl PVenv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Nm zfs Cm receive Fl s for more details. .It Xo .Nm zfs .Cm send .Op Fl PVnv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Fl S .Ar filesystem .Xc Generate a send stream from a dataset that has been partially received. .Bl -tag -width "-L" .It Fl S , -saved This flag requires that the specified filesystem previously received a resumable send that did not finish and was interrupted. In such scenarios this flag enables the user to send this partially received state. Using this flag will always use the last fully received snapshot as the incremental source if it exists. .El .It Xo .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … .Xc Generate a new redaction bookmark. In addition to the typical bookmark information, a redaction bookmark contains the list of redacted blocks and the list of redaction snapshots specified. The redacted blocks are blocks in the snapshot which are not referenced by any of the redaction snapshots. These blocks are found by iterating over the metadata in each redaction snapshot to determine what has been changed since the target snapshot. Redaction is designed to support redacted zfs sends; see the entry for .Nm zfs Cm send for more information on the purpose of this operation. If a redact operation fails partway through (due to an error or a system failure), the redaction can be resumed by rerunning the same command. .El .Ss Redaction ZFS has support for a limited version of data subsetting, in the form of redaction. Using the .Nm zfs Cm redact command, a .Sy redaction bookmark can be created that stores a list of blocks containing sensitive information. When provided to .Nm zfs Cm send , this causes a .Sy redacted send to occur. Redacted sends omit the blocks containing sensitive information, replacing them with REDACT records. When these send streams are received, a .Sy redacted dataset is created. A redacted dataset cannot be mounted by default, since it is incomplete. It can be used to receive other send streams. In this way datasets can be used for data backup and replication, with all the benefits that zfs send and receive have to offer, while protecting sensitive information from being stored on less-trusted machines or services. .Pp For the purposes of redaction, there are two steps to the process. A redact step, and a send/receive step. First, a redaction bookmark is created. This is done by providing the .Nm zfs Cm redact command with a parent snapshot, a bookmark to be created, and a number of redaction snapshots. These redaction snapshots must be descendants of the parent snapshot, and they should modify data that is considered sensitive in some way. 
Any blocks of data modified by all of the redaction snapshots will be listed in the redaction bookmark, because it represents the truly sensitive information. When it comes to the send step, the send process will not send the blocks listed in the redaction bookmark, instead replacing them with REDACT records. When received on the target system, this will create a redacted dataset, missing the data that corresponds to the blocks in the redaction bookmark on the sending system. The incremental send streams from the original parent to the redaction snapshots can then also be received on the target system, and this will produce a complete snapshot that can be used normally. Incrementals from one snapshot on the parent filesystem and another can also be done by sending from the redaction bookmark, rather than the snapshots themselves. .Pp In order to make the purpose of the feature more clear, an example is provided. Consider a zfs filesystem containing four files. These files represent information for an online shopping service. One file contains a list of usernames and passwords, another contains purchase histories, a third contains click tracking data, and a fourth contains user preferences. The owner of this data wants to make it available for their development teams to test against, and their market research teams to do analysis on. The development teams need information about user preferences and the click tracking data, while the market research teams need information about purchase histories and user preferences. Neither needs access to the usernames and passwords. However, because all of this data is stored in one ZFS filesystem, it must all be sent and received together. In addition, the owner of the data wants to take advantage of features like compression, checksumming, and snapshots, so they do want to continue to use ZFS to store and transmit their data. Redaction can help them do so. First, they would make two clones of a snapshot of the data on the source. In one clone, they create the setup they want their market research team to see; they delete the usernames and passwords file, and overwrite the click tracking data with dummy information. In another, they create the setup they want the development teams to see, by replacing the passwords with fake information and replacing the purchase histories with randomly generated ones. They would then create a redaction bookmark on the parent snapshot, using snapshots on the two clones as redaction snapshots. The parent can then be sent, redacted, to the target server where the research and development teams have access. Finally, incremental sends from the parent snapshot to each of the clones can be sent to and received on the target server; these snapshots are identical to the ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . .Sh SIGNALS See .Fl v . . .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a and .Em poolB/received/fs@b , respectively. .Em poolB must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . 
.Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar pool/fs@a | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs Ns @ Ns Ar a .No # Nm zfs Cm send Fl i Ar a pool/fs@b | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs .Ed . .Ss Example 2 : No Using the Nm zfs Cm receive Fl d No Option The following command sends a full stream of .Ar poolA/fsA/fsB@snap to a remote machine, receiving it into .Ar poolB/received/fsA/fsB@snap . The .Ar fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Ar poolB must contain the file system .Ar poolB/received . If .Ar poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar poolA/fsA/fsB@snap | .No " " Nm ssh Ar host Nm zfs Cm receive Fl d Ar poolB/received .Ed . .Sh SEE ALSO .Xr zfs-bookmark 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-snapshot 8 diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 881deb5bf666..96f0086d7858 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -1,786 +1,799 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ #ifndef _KERNEL #include #include #include #include #include #endif #include #include #include #include #include #include #include "zfeature_common.h" /* * Set to disable all feature checks while opening pools, allowing pools with * unsupported features to be opened. Set for testing only. */ boolean_t zfeature_checks_disable = B_FALSE; zfeature_info_t spa_feature_table[SPA_FEATURES]; /* * Valid characters for feature guids. This list is mainly for aesthetic * purposes and could be expanded in the future. There are different allowed * characters in the guids reverse dns portion (before the colon) and its * short name (after the colon). */ static int valid_char(char c, boolean_t after_colon) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || (after_colon && c == '_') || (!after_colon && (c == '.' || c == '-'))); } /* * Every feature guid must contain exactly one colon which separates a reverse * dns organization name from the feature's "short" name (e.g. * "com.company:feature_name"). 
*/ boolean_t zfeature_is_valid_guid(const char *name) { int i; boolean_t has_colon = B_FALSE; i = 0; while (name[i] != '\0') { char c = name[i++]; if (c == ':') { if (has_colon) return (B_FALSE); has_colon = B_TRUE; continue; } if (!valid_char(c, has_colon)) return (B_FALSE); } return (has_colon); } boolean_t zfeature_is_supported(const char *guid) { if (zfeature_checks_disable) return (B_TRUE); for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(guid, feature->fi_guid) == 0) return (B_TRUE); } return (B_FALSE); } int zfeature_lookup_guid(const char *guid, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(guid, feature->fi_guid) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } int zfeature_lookup_name(const char *name, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(name, feature->fi_uname) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { zfeature_info_t *feature = &spa_feature_table[fid]; for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { if (feature->fi_depends[i] == check) return (B_TRUE); } return (B_FALSE); } static boolean_t deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature) { for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++) if (deps[i] == feature) return (B_TRUE); return (B_FALSE); } #define STRCMP ((int(*)(const void *, const void *))&strcmp) struct zfs_mod_supported_features { void *tree; boolean_t all_features; }; struct zfs_mod_supported_features * zfs_mod_list_supported(const char *scope) { #if defined(__FreeBSD__) || defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) (void) scope; return (NULL); #else struct zfs_mod_supported_features *ret = calloc(1, sizeof (*ret)); if (ret == NULL) return (NULL); DIR *sysfs_dir = NULL; char path[128]; if (snprintf(path, sizeof (path), "%s/%s", ZFS_SYSFS_DIR, scope) < sizeof (path)) sysfs_dir = opendir(path); if (sysfs_dir == NULL && errno == ENOENT) { if (snprintf(path, sizeof (path), "%s/%s", ZFS_SYSFS_ALT_DIR, scope) < sizeof (path)) sysfs_dir = opendir(path); } if (sysfs_dir == NULL) { ret->all_features = errno == ENOENT && (access(ZFS_SYSFS_DIR, F_OK) == 0 || access(ZFS_SYSFS_ALT_DIR, F_OK) == 0); return (ret); } struct dirent *node; while ((node = readdir(sysfs_dir)) != NULL) { if (strcmp(node->d_name, ".") == 0 || strcmp(node->d_name, "..") == 0) continue; char *name = strdup(node->d_name); if (name == NULL) { goto nomem; } if (tsearch(name, &ret->tree, STRCMP) == NULL) { /* * Don't bother checking for duplicate entries: * we're iterating a single directory. 
*/ free(name); goto nomem; } } end: closedir(sysfs_dir); return (ret); nomem: zfs_mod_list_supported_free(ret); ret = NULL; goto end; #endif } void zfs_mod_list_supported_free(struct zfs_mod_supported_features *list) { #if !defined(__FreeBSD__) && !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) if (list) { tdestroy(list->tree, free); free(list); } #else (void) list; #endif } #if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) static boolean_t zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs) { char path[128]; if (snprintf(path, sizeof (path), "%s%s%s%s%s", sysfs, scope == NULL ? "" : "/", scope ?: "", name == NULL ? "" : "/", name ?: "") < sizeof (path)) return (access(path, F_OK) == 0); else return (B_FALSE); } boolean_t zfs_mod_supported(const char *scope, const char *name, const struct zfs_mod_supported_features *sfeatures) { boolean_t supported; if (sfeatures != NULL) return (sfeatures->all_features || tfind(name, &sfeatures->tree, STRCMP)); /* * Check both the primary and alternate sysfs locations to determine * if the required functionality is supported. */ supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) || zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR)); /* * For backwards compatibility with kernel modules that predate * supported feature/property checking. Report the feature/property * as supported if the kernel module is loaded but the requested * scope directory does not exist. */ if (supported == B_FALSE) { if ((access(ZFS_SYSFS_DIR, F_OK) == 0 && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR)) || (access(ZFS_SYSFS_ALT_DIR, F_OK) == 0 && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR))) { supported = B_TRUE; } } return (supported); } #endif static boolean_t zfs_mod_supported_feature(const char *name, const struct zfs_mod_supported_features *sfeatures) { /* * The zfs module spa_feature_table[], whether in-kernel or in * libzpool, always supports all the features. libzfs needs to * query the running module, via sysfs, to determine which * features are supported. * * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. Therefore, we return * that all features are supported. */ #if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) (void) name, (void) sfeatures; return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name, sfeatures)); #endif } static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, zfeature_flags_t flags, zfeature_type_t type, const spa_feature_t *deps, const struct zfs_mod_supported_features *sfeatures) { zfeature_info_t *feature = &spa_feature_table[fid]; static const spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; ASSERT(name != NULL); ASSERT(desc != NULL); ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || (flags & ZFEATURE_FLAG_MOS) == 0); ASSERT3U(fid, <, SPA_FEATURES); ASSERT(zfeature_is_valid_guid(guid)); if (deps == NULL) deps = nodeps; VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) || (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET))); feature->fi_feature = fid; feature->fi_guid = guid; feature->fi_uname = name; feature->fi_desc = desc; feature->fi_flags = flags; feature->fi_type = type; feature->fi_depends = deps; feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid, sfeatures); } /* * Every feature has a GUID of the form com.example:feature_name. 
The * reversed DNS name ensures that the feature's GUID is unique across all ZFS * implementations. This allows companies to independently develop and * release features. Examples include org.delphix and org.datto. Previously, * features developed on one implementation have used that implementation's * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs * domain name is recommended for new features which are developed by the * OpenZFS community and its platforms. This domain may optionally be used by * companies developing features for initial release through an OpenZFS * implementation. Use of the org.openzfs domain requires reserving the * feature name in advance with the OpenZFS project. */ void zpool_feature_init(void) { struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_POOL_FEATURES); zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", "Destroy filesystems asynchronously.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", "Snapshots use less space.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", "LZ4 compression algorithm support.", ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", "Crash dumps to multiple vdev pools.", 0, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_ENABLED_TXG, "com.delphix:enabled_txg", "enabled_txg", "Record txg at which a feature is enabled", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_HOLE_BIRTH, "com.delphix:hole_birth", "hole_birth", "Retain hole birth txg for more precise zfs send", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, hole_birth_deps, sfeatures); } zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, "com.delphix:zpool_checkpoint", "zpool_checkpoint", "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_SPACEMAP_V2, "com.delphix:spacemap_v2", "spacemap_v2", "Space maps representing large segments are more efficient.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", 0, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t bookmarks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARKS, "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, bookmarks_deps, sfeatures); } { static const spa_feature_t filesystem_limits_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_FS_SS_LIMIT, 
"com.joyent:filesystem_limits", "filesystem_limits", "Filesystem and snapshot limits.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, filesystem_limits_deps, sfeatures); } zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t livelist_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LIVELIST, "com.delphix:livelist", "livelist", "Improved clone deletion performance.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, livelist_deps, sfeatures); } { static const spa_feature_t log_spacemap_deps[] = { SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LOG_SPACEMAP, "com.delphix:log_spacemap", "log_spacemap", "Log metaslab changes on a single spacemap and " "flush them periodically.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, log_spacemap_deps, sfeatures); } { static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_BLOCKS, "org.open-zfs:large_blocks", "large_blocks", "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_blocks_deps, sfeatures); } { static const spa_feature_t large_dnode_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_DNODE, "org.zfsonlinux:large_dnode", "large_dnode", "Variable on-disk size of dnodes.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_dnode_deps, sfeatures); } { static const spa_feature_t sha512_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SHA512, "org.illumos:sha512", "sha512", "SHA-512/256 hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, sha512_deps, sfeatures); } { static const spa_feature_t skein_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SKEIN, "org.illumos:skein", "skein", "Skein hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, skein_deps, sfeatures); } { static const spa_feature_t edonr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_EDONR, "org.illumos:edonr", "edonr", "Edon-R hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, edonr_deps, sfeatures); } { static const spa_feature_t redact_books_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, "com.delphix:redaction_bookmarks", "redaction_bookmarks", "Support for bookmarks which store redaction lists for zfs " "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, redact_books_deps, sfeatures); } { static const spa_feature_t redact_datasets_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTED_DATASETS, "com.delphix:redacted_datasets", "redacted_datasets", "Support for redacted datasets, produced by receiving " "a redacted zfs send stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, redact_datasets_deps, sfeatures); } { static const spa_feature_t bookmark_written_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, "com.delphix:bookmark_written", 
"bookmark_written", "Additional accounting, enabling the written# " "property (space written since a bookmark), " "and estimates of send stream sizes for incrementals from " "bookmarks.", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps, sfeatures); } zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", "Top-level vdevs can be removed, reducing logical pool size.", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t obsolete_counts_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, "com.delphix:obsolete_counts", "obsolete_counts", "Reduce memory used by removed devices when their blocks " "are freed or remapped.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, obsolete_counts_deps, sfeatures); } { static const spa_feature_t userobj_accounting_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING, "org.zfsonlinux:userobj_accounting", "userobj_accounting", "User/Group object accounting.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps, sfeatures); } { static const spa_feature_t bookmark_v2_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_V2, "com.datto:bookmark_v2", "bookmark_v2", "Support for larger bookmarks", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps, sfeatures); } { static const spa_feature_t encryption_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ENCRYPTION, "com.datto:encryption", "encryption", "Support for dataset level encryption", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, encryption_deps, sfeatures); } { static const spa_feature_t project_quota_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_PROJECT_QUOTA, "org.zfsonlinux:project_quota", "project_quota", "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, project_quota_deps, sfeatures); } zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, "org.zfsonlinux:allocation_classes", "allocation_classes", "Support for separate allocation classes.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_RESILVER_DEFER, "com.datto:resilver_defer", "resilver_defer", "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t zstd_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, "org.freebsd:zstd_compress", "zstd_compress", "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps, sfeatures); } zfeature_register(SPA_FEATURE_DRAID, "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t zilsaxattr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ZILSAXATTR, 
"org.openzfs:zilsaxattr", "zilsaxattr", "Support for xattr=sa extended attribute logging in ZIL.", ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); } zfeature_register(SPA_FEATURE_HEAD_ERRLOG, "com.delphix:head_errlog", "head_errlog", "Support for per-dataset on-disk error logs.", ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t blake3_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BLAKE3, "org.openzfs:blake3", "blake3", "BLAKE3 hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, blake3_deps, sfeatures); } zfeature_register(SPA_FEATURE_BLOCK_CLONING, "com.fudosecurity:block_cloning", "block_cloning", "Support for block cloning via Block Reference Table.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_AVZ_V2, "com.klarasystems:vdev_zaps_v2", "vdev_zaps_v2", "Support for root vdev ZAP.", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t redact_list_spill_deps[] = { SPA_FEATURE_REDACTION_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTION_LIST_SPILL, "com.delphix:redaction_list_spill", "redaction_list_spill", "Support for increased number of redaction_snapshot " "arguments in zfs redact.", 0, ZFEATURE_TYPE_BOOLEAN, redact_list_spill_deps, sfeatures); } zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION, "org.openzfs:raidz_expansion", "raidz_expansion", "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_FAST_DEDUP, "com.klarasystems:fast_dedup", "fast_dedup", "Support for advanced deduplication", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t longname_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LONGNAME, "org.zfsonlinux:longname", "longname", "support filename up to 1024 bytes", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, longname_deps, sfeatures); } + { + static const spa_feature_t large_microzap_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_MICROZAP, + "com.klarasystems:large_microzap", "large_microzap", + "Support for microzaps larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } #if defined(_KERNEL) EXPORT_SYMBOL(zfeature_lookup_guid); EXPORT_SYMBOL(zfeature_lookup_name); EXPORT_SYMBOL(zfeature_is_supported); EXPORT_SYMBOL(zfeature_is_valid_guid); EXPORT_SYMBOL(zfeature_depends_on); EXPORT_SYMBOL(zpool_feature_init); EXPORT_SYMBOL(spa_feature_table); #endif diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 4877eb7e62f8..b1cd981cec1d 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1,3825 +1,3846 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019 Datto Inc. * Copyright (c) 2022 Axcient. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE; static uint_t zfs_recv_queue_ff = 20; static uint_t zfs_recv_write_batch_size = 1024 * 1024; static int zfs_recv_best_effort_corrective = 0; static const void *const dmu_recv_tag = "dmu_recv_tag"; const char *const recv_clone_name = "%recv"; typedef enum { ORNS_NO, ORNS_YES, ORNS_MAYBE } or_need_sync_t; static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; /* * These three members are used to signal to the main thread when * we're done. 
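 * A minimal sketch of the intended handoff on the writer side (details
 * may differ slightly from the actual thread code):
 *
 *	mutex_enter(&rwa->mutex);
 *	rwa->done = B_TRUE;
 *	cv_signal(&rwa->cv);
 *	mutex_exit(&rwa->mutex);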
*/ kmutex_t mutex; kcondvar_t cv; boolean_t done; int err; const char *tofs; boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ list_t write_batch; /* Encryption parameters for the last received DRR_OBJECT_RANGE */ boolean_t or_crypt_params_present; uint64_t or_firstobj; uint64_t or_numslots; uint8_t or_salt[ZIO_DATA_SALT_LEN]; uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; zio_t *heal_pio; /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ or_need_sync_t or_need_sync; }; typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; proc_t *drba_proc; dsl_crypto_params_t *drba_dcp; } dmu_recv_begin_arg_t; static void byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO32(drr_object.drr_raw_bonuslen); DO64(drr_object.drr_toguid); DO64(drr_object.drr_maxblkid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_logical_size); DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); DO64(drr_spill.drr_compressed_size); DO32(drr_spill.drr_type); break; case DRR_OBJECT_RANGE: DO64(drr_object_range.drr_firstobj); DO64(drr_object_range.drr_numslots); DO64(drr_object_range.drr_toguid); break; case DRR_REDACT: DO64(drr_redact.drr_object); DO64(drr_redact.drr_offset); DO64(drr_redact.drr_length); DO64(drr_redact.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; default: break; } if (drr->drr_type != DRR_BEGIN) { ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); } #undef DO64 #undef DO32 } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } /* * Check that the new stream we're trying to 
receive is redacted with respect to * a subset of the snapshots that the origin was redacted with respect to. For * the reasons behind this, see the man page on redacted zfs sends and receives. */ static boolean_t compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, uint64_t *redact_snaps, uint64_t num_redact_snaps) { /* * Short circuit the comparison; if we are redacted with respect to * more snapshots than the origin, we can't be redacted with respect * to a subset. */ if (num_redact_snaps > origin_num_snaps) { return (B_FALSE); } for (int i = 0; i < num_redact_snaps; i++) { if (!redact_snaps_contains(origin_snaps, origin_num_snaps, redact_snaps[i])) { return (B_FALSE); } } return (B_TRUE); } static boolean_t redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) { uint64_t *origin_snaps; uint64_t origin_num_snaps; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); int err = 0; boolean_t ret = B_TRUE; uint64_t *redact_snaps; uint_t numredactsnaps; /* * If this is a full send stream, we're safe no matter what. */ if (drrb->drr_fromguid == 0) return (ret); VERIFY(dsl_dataset_get_uint64_array_feature(origin, SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { /* * If the send stream was sent from the redaction bookmark or * the redacted version of the dataset, then we're safe. Verify * that this is from the a compatible redaction bookmark or * redacted dataset. */ if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { /* * If the stream is redacted, it must be redacted with respect * to a subset of what the origin is redacted with respect to. * See case number 2 in the zfs man page section on redacted zfs * send. */ err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); if (err != 0 || !compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, drrb->drr_toguid)) { /* * If the stream isn't redacted but the origin is, this must be * one of the snapshots the origin is redacted with respect to. * See case number 1 in the zfs man page section on redacted zfs * send. */ err = EINVAL; } if (err != 0) ret = B_FALSE; return (ret); } /* * If we previously received a stream with --large-block, we don't support * receiving an incremental on top of it without --large-block. This avoids * forcing a read-modify-write or trying to re-aggregate a string of WRITE * records. 
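 *
 * For example (illustrative commands), if the dataset was originally
 * received from a stream that actually contained large blocks:
 *
 *	# zfs send -L pool/fs@a | zfs receive poolB/fs
 *	# zfs send -i a pool/fs@b | zfs receive poolB/fs
 *
 * the second, non -L incremental is rejected with
 * ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH instead of being rewritten into
 * smaller blocks.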
*/ static int recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) { if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); return (0); } static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { uint64_t obj; uint64_t children; int error; dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); /* Resume state must not be set. */ if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &obj); if (drba->drba_cookie->drc_heal) { if (error != 0) return (error); } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); } /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); if (error != 0) return (error); if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && children > 0) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) return (error); if (drba->drba_cookie->drc_heal) { /* Encryption is incompatible with embedded data. */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Healing is not supported when in 'force' mode. */ if (drba->drba_cookie->drc_force) return (SET_ERROR(EINVAL)); /* Must have keys loaded if doing encrypted non-raw recv. */ if (encrypted && !raw) { if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, NULL, NULL) != 0) return (SET_ERROR(EACCES)); } error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (error); /* * When not doing best effort corrective recv healing can only * be done if the send stream is for the same snapshot as the * one we are trying to heal. */ if (zfs_recv_best_effort_corrective == 0 && drba->drba_cookie->drc_drrb->drr_toguid != dsl_dataset_phys(snap)->ds_guid) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENOTSUP)); } dsl_dataset_rele(snap, FTAG); } else if (fromguid != 0) { /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ if (!encrypted && raw) return (SET_ERROR(EINVAL)); /* Encryption is incompatible with embedded data */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Find snapshot in this dir that matches fromguid. 
*/ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_cookie->drc_fromsnapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. Raw sends have an * additional constraint that requires that * no "noop" snapshots exist between fromsnap * and tosnap for the IVset checking code to * work properly. */ if (dsl_dataset_modified_since_snap(ds, snap) || (raw && dsl_dataset_phys(ds)->ds_prev_snap_obj != snap->ds_object)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_cookie->drc_fromsnapobj = ds->ds_prev->ds_object; } if (dsl_dataset_feature_is_active(snap, SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(EINVAL)); } error = recv_check_large_blocks(snap, featureflags); if (error != 0) { dsl_dataset_rele(snap, FTAG); return (error); } dsl_dataset_rele(snap, FTAG); } else { /* If full and not healing then must be forced. */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* * We don't support using zfs recv -F to blow away * encrypted filesystems. This would require the * dsl dir to point to the old encryption key and * the new one at the same time during the receive. */ if ((!encrypted && raw) || encrypted) return (SET_ERROR(EINVAL)); /* * Perform the same encryption checks we would if * we were creating a new dataset from scratch. */ if (!raw) { boolean_t will_encrypt; error = dmu_objset_create_crypt_check( ds->ds_dir->dd_parent, drba->drba_dcp, &will_encrypt); if (error != 0) return (error); if (will_encrypt && embed) return (SET_ERROR(EINVAL)); } } return (0); } /* * Check that any feature flags used in the data stream we're receiving are * supported by the pool we are receiving into. * * Note that some of the features we explicitly check here have additional * (implicit) features they depend on, but those dependencies are enforced * through the zfeature_register() calls declaring the features that we * explicitly check. */ static int recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) { /* * Check if there are any unsupported feature flags. */ if (!DMU_STREAM_SUPPORTED(featureflags)) { return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE)); } /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, * and large_dnodes in the stream can only be used if those pool * features are enabled because we don't attempt to decompress / * un-embed / un-mooch / split up the blocks / dnodes during the * receive process. 
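 *
 * From the user's point of view such a stream simply fails to receive
 * with ENOTSUP until the matching pool feature is enabled, e.g.
 * (illustrative):
 *
 *	# zpool set feature@large_microzap=enabled poolB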
*/ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (SET_ERROR(ENOTSUP)); /* * Receiving redacted streams requires that redacted datasets are * enabled. */ if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) return (SET_ERROR(ENOTSUP)); /* * If the LONGNAME is not enabled on the target, fail that request. */ if ((featureflags & DMU_BACKUP_FEATURE_LONGNAME) && !spa_feature_is_enabled(spa, SPA_FEATURE_LONGNAME)) return (SET_ERROR(ENOTSUP)); return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa); if (error != 0) return (error); /* Resumable receives require extensible datasets */ if (drba->drba_cookie->drc_resumable && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) return (SET_ERROR(ENOTSUP)); if (featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require the encryption feature */ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) return (SET_ERROR(ENOTSUP)); /* embedded data is incompatible with encryption and raw recv */ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (SET_ERROR(EINVAL)); /* raw receives require spill block allocation flag */ if (!(flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { /* * We support unencrypted datasets below encrypted ones now, * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing * with a dataset we may encrypt. 
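 * Concretely (see the check below), the decrypting hold is skipped only
 * when the caller supplied crypto params with cp_crypt == ZIO_CRYPT_OFF,
 * i.e. an explicitly unencrypted receive.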
*/ if (drba->drba_dcp == NULL || drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) { dsflags |= DS_HOLD_FLAG_DECRYPT; } } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid, featureflags); dsl_dataset_rele_flags(ds, dsflags, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[ZFS_MAX_DATASET_NAME_LEN]; objset_t *os; /* healing recv must be done "into" an existing snapshot */ if (drba->drba_cookie->drc_heal == B_TRUE) return (SET_ERROR(ENOTSUP)); /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) || drba->drba_origin)) return (SET_ERROR(ENOENT)); /* * If we're receiving a full send as a clone, and it doesn't * contain all the necessary free records and freeobject * records, reject it. */ if (fromguid == 0 && drba->drba_origin != NULL && !(flags & DRR_FLAG_FREERECORDS)) return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && drba->drba_origin == NULL) { boolean_t will_encrypt; /* * Check that we aren't breaking any encryption rules * and that we have all the parameters we need to * create an encrypted dataset if necessary. If we are * making an encrypted dataset the stream can't have * embedded data. */ error = dmu_objset_create_crypt_check(ds->ds_dir, drba->drba_dcp, &will_encrypt); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (will_encrypt && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } } /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* can't recv below anything but filesystems (eg. 
no ZVOLs) */ error = dmu_objset_from_ds(ds, &os); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold_flags(dp, drba->drba_origin, dsflags, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } if (origin->ds_dir->dd_crypto_obj != 0 && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * If the origin is redacted we need to verify that this * send stream can safely be received on top of the * origin. */ if (dsl_dataset_feature_is_active(origin, SPA_FEATURE_REDACTED_DATASETS)) { if (!redact_check(drba, origin)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } } error = recv_check_large_blocks(ds, featureflags); if (error != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } dsl_dataset_rele_flags(origin, dsflags, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; const char *tofs = drc->drc_tofs; uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; dsl_crypto_params_t *dcp = drba->drba_dcp; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) dsflags |= DS_HOLD_FLAG_DECRYPT; /* * Raw, non-incremental recvs always use a dummy dcp with * the raw cmd set. Raw incremental recvs do not use a dcp * since the encryption parameters are already set in stone. 
*/ if (dcp == NULL && drrb->drr_fromguid == 0 && drba->drba_origin == NULL) { ASSERT3P(dcp, ==, NULL); dcp = &dummy_dcp; if (featureflags & DMU_BACKUP_FEATURE_RAW) dcp->cp_cmd = DCP_CMD_RAW_RECV; } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } if (drc->drc_heal) { /* When healing we want to use the provided snapshot */ VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, &dsobj)); } else { dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, dcp, tx); } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); ASSERT3P(dcp, ==, NULL); } /* Create new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, dcp, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drc->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, &newds)); if (dsl_dataset_feature_is_active(newds, SPA_FEATURE_REDACTED_DATASETS)) { /* * If the origin dataset is redacted, the child will be redacted * when we create it. We clear the new dataset's * redaction info; if it should be redacted, we'll fill * in its information later. */ dsl_dataset_deactivate_feature(newds, SPA_FEATURE_REDACTED_DATASETS, tx); } VERIFY0(dmu_objset_from_ds(newds, &os)); if (drc->drc_resumable) { dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 8, 1, &drrb->drr_fromguid, tx)); } VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 8, 1, &drrb->drr_toguid, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); uint64_t one = 1; uint64_t zero = 0; VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 8, 1, &one, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_RAW) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, 8, 1, &one, tx)); } uint64_t *redact_snaps; uint_t numredactsnaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, sizeof (*redact_snaps), numredactsnaps, redact_snaps, tx)); } } /* * Usually the os->os_encrypted value is tied to the presence of a * DSL Crypto Key object in the dd. However, that will not be received * until dmu_recv_stream(), so we set the value manually for now. 
*/ if (featureflags & DMU_BACKUP_FEATURE_RAW) { os->os_encrypted = B_TRUE; drba->drba_cookie->drc_raw = B_TRUE; } if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t *redact_snaps; uint_t numredactsnaps; VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps)); dsl_dataset_activate_redaction(newds, redact_snaps, numredactsnaps, tx); } + if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) { + /* + * The source has seen a large microzap at least once in its + * life, so we activate the feature here to match. It's not + * strictly necessary since a large microzap is usable without + * the feature active, but if that object is sent on from here, + * we need this info so we know to add the stream feature. + * + * There may be no large microzap in the incoming stream, or + * ever again, but this is a very niche feature and it's very + * difficult to spot a large microzap in the stream, so it's + * not worth the effort of trying harder to activate the + * feature at first use. + */ + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP, + (void *)B_TRUE, tx); + } + dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * Activate longname feature if received */ if (featureflags & DMU_BACKUP_FEATURE_LONGNAME && !dsl_dataset_feature_is_active(newds, SPA_FEATURE_LONGNAME)) { dsl_dataset_activate_feature(newds->ds_object, SPA_FEATURE_LONGNAME, (void *)B_TRUE, tx); newds->ds_feature[SPA_FEATURE_LONGNAME] = (void *)B_TRUE; } /* * If we actually created a non-clone, we need to create the objset * in our new dataset. If this is a raw send we postpone this until * dmu_recv_stream() so that we can allocate the metadnode with the * properties from the DRR_BEGIN payload. */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; drba->drba_cookie->drc_os = os; spa_history_log_internal_ds(newds, "receive", tx, " "); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dmu_recv_cookie_t *drc = drba->drba_cookie; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drc->drc_drrb; int error; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; const char *tofs = drc->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); /* * This is mostly a sanity check since we should have already done these * checks during a previous attempt to receive the data.
*/ error = recv_begin_check_feature_flags_impl(drc->drc_featureflags, dp->dp_spa); if (error != 0) return (error); /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require spill block allocation flag */ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } boolean_t recvexist = B_TRUE; if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ recvexist = B_FALSE; error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } /* * Resume of full/newfs recv on existing dataset should be done with * force flag */ if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(ZFS_ERR_RESUME_EXISTS)); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } uint64_t val; error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Check if the receive is still running. If so, it will be owned. * Note that nothing else can own the dataset (e.g. after the receive * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. */ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Note: resume point will be checked when we process the first WRITE * record. */ /* check that the origin matches */ val = 0; (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (ds->ds_prev != NULL && drrb->drr_fromguid != 0) drc->drc_fromsnapobj = ds->ds_prev->ds_object; /* * If we're resuming, and the send is redacted, then the original send * must have been redacted, and must have been redacted with respect to * the same snapshots. 
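 * For example, if the interrupted receive recorded redaction snapshots
 * {A, B}, then a resumed stream carrying {A, C}, or carrying no
 * redaction list at all, is rejected with EINVAL below.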
*/ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t num_ds_redact_snaps; uint64_t *ds_redact_snaps; uint_t num_stream_redact_snaps; uint64_t *stream_redact_snaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &stream_redact_snaps, &num_stream_redact_snaps) != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (!dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps, &ds_redact_snaps)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } for (int i = 0; i < num_ds_redact_snaps; i++) { if (!redact_snaps_contains(ds_redact_snaps, num_ds_redact_snaps, stream_redact_snaps[i])) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } } } error = recv_check_large_blocks(ds, drc->drc_featureflags); if (error != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } static void dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { drba->drba_cookie->drc_raw = B_TRUE; } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) != 0) { /* %recv does not exist; continue in tofs */ VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } ASSERT(DS_IS_INCONSISTENT(ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drba->drba_cookie->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); drba->drba_cookie->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. 
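 *
 * A minimal sketch of the expected calling pattern (arguments
 * abbreviated, error handling elided):
 *
 *	dmu_recv_cookie_t drc;
 *	err = dmu_recv_begin(tofs, tosnap, drr_begin, ..., &drc, fp, &off);
 *	if (err == 0)
 *		err = dmu_recv_stream(&drc, &off);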
*/ int dmu_recv_begin(const char *tofs, const char *tosnap, dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp, offset_t *voffp) { dmu_recv_begin_arg_t drba = { 0 }; int err = 0; memset(drc, 0, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_heal = heal; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_proc = curproc; drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); } drc->drc_fp = fp; drc->drc_voff = *voffp; drc->drc_featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; /* * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard * upper limit. Systems with less than 1GB of RAM will see a lower * limit from `arc_all_memory() / 4`. */ if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4))) return (E2BIG); if (payloadlen != 0) { void *payload = vmem_alloc(payloadlen, KM_SLEEP); /* * For compatibility with recursive send streams, we don't do * this here if the stream could be part of a package. Instead, * we'll do it in dmu_recv_stream. If we pull the next header * too early, and it's the END record, we break the `recv_skip` * logic. */ err = receive_read_payload_and_next_header(drc, payloadlen, payload); if (err != 0) { vmem_free(payload, payloadlen); return (err); } err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, KM_SLEEP); vmem_free(payload, payloadlen); if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); return (err); } } if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) drc->drc_spill = B_TRUE; drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); drba.drba_proc = curproc; if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = dsl_sync_task(tofs, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); } else { /* * For non-raw, non-incremental, non-resuming receives the * user can specify encryption parameters on the command line * with "zfs recv -o". For these receives we create a dcp and * pass it to the sync task. Creating the dcp will implicitly * remove the encryption params from the localprops nvlist, * which avoids errors when trying to set these normally * read-only properties. Any other kind of receive that * attempts to set these properties will fail as a result. 
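 *
 * e.g. (illustrative command; the property values are examples only):
 *
 *	# zfs send pool/src@snap | zfs receive -o encryption=on \
 *	    -o keyformat=passphrase -o keylocation=file:///path/to/key \
 *	    poolB/received/fs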
*/ if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW) == 0 && origin == NULL && drc->drc_drrb->drr_fromguid == 0) { err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, localprops, hidden_args, &drba.drba_dcp); } if (err == 0) { err = dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); dsl_crypto_params_free(drba.drba_dcp, !!err); } } if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); nvlist_free(drc->drc_begin_nvl); } return (err); } /* * Holds data need for corrective recv callback */ typedef struct cr_cb_data { uint64_t size; zbookmark_phys_t zb; spa_t *spa; } cr_cb_data_t; static void corrective_read_done(zio_t *zio) { cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) { spa_remove_error(data->spa, &data->zb, BP_GET_LOGICAL_BIRTH(zio->io_bp)); } kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } /* * zio_rewrite the data pointed to by bp with the data from the rrd's abd. */ static int do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, struct receive_record_arg *rrd, blkptr_t *bp) { int err; zio_t *io; zbookmark_phys_t zb; dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); if (err != 0) return (err); SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, dbuf_whichblock(dn, 0, drrw->drr_offset)); dnode_rele(dn, FTAG); if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { /* Decompress the stream data */ abd_t *dabd = abd_alloc_linear( drrw->drr_logical_size, B_FALSE); err = zio_decompress_data(drrw->drr_compressiontype, abd, dabd, abd_get_size(abd), abd_get_size(dabd), NULL); if (err != 0) { abd_free(dabd); return (err); } /* Swap in the newly decompressed data into the abd */ abd_free(abd); abd = dabd; } if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* Recompress the data */ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), abd, &cabd, abd_get_size(abd), BP_GET_PSIZE(bp), rwa->os->os_complevel); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); /* Swap in newly compressed data into the abd */ abd_free(abd); abd = cabd; flags |= ZIO_FLAG_RAW_COMPRESS; } /* * The stream is not encrypted but the data on-disk is. * We need to re-encrypt the buf using the same * encryption type, salt, iv, and mac that was used to encrypt * the block previosly. 
*/ if (!rwa->raw && BP_USES_CRYPT(bp)) { dsl_dataset_t *ds; dsl_crypto_key_t *dck = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; dsl_pool_t *dp = dmu_objset_pool(rwa->os); abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_flags(dp, rwa->tofs, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (err != 0) { dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } /* Look up the key from the spa's keystore */ err = spa_keystore_lookup_key(rwa->os->os_spa, zb.zb_objset, FTAG, &dck); if (err != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, abd_get_size(abd), abd, eabd, &no_crypt); spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); ASSERT0(no_crypt); if (err != 0) { abd_free(eabd); return (err); } /* Swap in the newly encrypted data into the abd */ abd_free(abd); abd = eabd; /* * We want to prevent zio_rewrite() from trying to * encrypt the data again */ flags |= ZIO_FLAG_RAW_ENCRYPT; } rrd->abd = abd; io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || abd_get_size(abd) == BP_GET_PSIZE(bp)); /* compute new bp checksum value and make sure it matches the old one */ zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { zio_destroy(io); if (zfs_recv_best_effort_corrective != 0) return (0); return (SET_ERROR(ECKSUM)); } /* Correct the corruption in place */ err = zio_wait(io); if (err == 0) { cr_cb_data_t *cb_data = kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); cb_data->spa = rwa->os->os_spa; cb_data->size = drrw->drr_logical_size; cb_data->zb = zb; /* Test if healing worked by re-reading the bp */ err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), drrw->drr_logical_size, corrective_read_done, cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); } if (err != 0 && zfs_recv_best_effort_corrective != 0) err = 0; return (err); } static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { int done = 0; /* * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ ASSERT(len % 8 == 0 || (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { ssize_t resid = len - done; zfs_file_t *fp = drc->drc_fp; int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); if (err == 0 && resid == len - done) { /* * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates * that the receive was interrupted and can * potentially be resumed. 
*/ err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); } drc->drc_voff += len - done - resid; done = len - resid; if (err != 0) return (err); } drc->drc_bytes_read += len; ASSERT3U(done, ==, len); return (0); } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_OLD_MAX_BONUSLEN - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } static void save_resume_state(struct receive_writer_arg *rwa, uint64_t object, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (!rwa->resumable) return; /* * We use ds_resume_bytes[] != 0 to indicate that we need to * update this on disk, so it must not be 0. */ ASSERT(rwa->bytes_read != 0); /* * We only resume from write records, which have a valid * (non-meta-dnode) object number. */ ASSERT(object != 0); /* * For resuming to work correctly, we must receive records in order, * sorted by object,offset. This is checked by the callers, but * assert it here for good measure. */ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); ASSERT3U(rwa->bytes_read, >=, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } static int receive_object_is_same_generation(objset_t *os, uint64_t object, dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, const void *new_bonus, boolean_t *samegenp) { zfs_file_info_t zoi; int err; dmu_buf_t *old_bonus_dbuf; err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); if (err != 0) return (err); err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, &zoi); dmu_buf_rele(old_bonus_dbuf, FTAG); if (err != 0) return (err); uint64_t old_gen = zoi.zfi_generation; err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); if (err != 0) return (err); uint64_t new_gen = zoi.zfi_generation; *samegenp = (old_gen == new_gen); return (0); } static int receive_handle_existing_object(const struct receive_writer_arg *rwa, const struct drr_object *drro, const dmu_object_info_t *doi, const void *bonus_data, uint64_t *object_to_hold, uint32_t *new_blksz) { uint32_t indblksz = drro->drr_indblkshift ? 1ULL << drro->drr_indblkshift : 0; int nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; boolean_t do_free_range = B_FALSE; int err; *object_to_hold = drro->drr_object; /* nblkptr should be bounded by the bonus size and type */ if (rwa->raw && nblkptr != drro->drr_nblkptr) return (SET_ERROR(EINVAL)); /* * After the previous send stream, the sending system may * have freed this object, and then happened to re-allocate * this object number in a later txg. In this case, we are * receiving a different logical file, and the block size may * appear to be different. i.e. we may have a different * block size for this object than what the send stream says. * In this case we need to remove the object's contents, * so that its structure can be changed and then its contents * entirely replaced by subsequent WRITE records. * * If this is a -L (--large-block) incremental stream, and * the previous stream was not -L, the block size may appear * to increase. i.e. 
we may have a smaller block size for * this object than what the send stream says. In this case * we need to keep the object's contents and block size * intact, so that we don't lose parts of the object's * contents that are not changed by this incremental send * stream. * * We can distinguish between the two above cases by using * the ZPL's generation number (see * receive_object_is_same_generation()). However, we only * want to rely on the generation number when absolutely * necessary, because with raw receives, the generation is * encrypted. We also want to minimize dependence on the * ZPL, so that other types of datasets can also be received * (e.g. ZVOLs, although note that ZVOLS currently do not * reallocate their objects or change their structure). * Therefore, we check a number of different cases where we * know it is safe to discard the object's contents, before * using the ZPL's generation number to make the above * distinction. */ if (drro->drr_blksz != doi->doi_data_block_size) { if (rwa->raw) { /* * RAW streams always have large blocks, so * we are sure that the data is not needed * due to changing --large-block to be on. * Which is fortunate since the bonus buffer * (which contains the ZPL generation) is * encrypted, and the key might not be * loaded. */ do_free_range = B_TRUE; } else if (rwa->full) { /* * This is a full send stream, so it always * replaces what we have. Even if the * generation numbers happen to match, this * can not actually be the same logical file. * This is relevant when receiving a full * send as a clone. */ do_free_range = B_TRUE; } else if (drro->drr_type != DMU_OT_PLAIN_FILE_CONTENTS || doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { /* * PLAIN_FILE_CONTENTS are the only type of * objects that have ever been stored with * large blocks, so we don't need the special * logic below. ZAP blocks can shrink (when * there's only one block), so we don't want * to hit the error below about block size * only increasing. */ do_free_range = B_TRUE; } else if (doi->doi_max_offset <= doi->doi_data_block_size) { /* * There is only one block. We can free it, * because its contents will be replaced by a * WRITE record. This can not be the no-L -> * -L case, because the no-L case would have * resulted in multiple blocks. If we * supported -L -> no-L, it would not be safe * to free the file's contents. Fortunately, * that is not allowed (see * recv_check_large_blocks()). */ do_free_range = B_TRUE; } else { boolean_t is_same_gen; err = receive_object_is_same_generation(rwa->os, drro->drr_object, doi->doi_bonus_type, drro->drr_bonustype, bonus_data, &is_same_gen); if (err != 0) return (SET_ERROR(EINVAL)); if (is_same_gen) { /* * This is the same logical file, and * the block size must be increasing. * It could only decrease if * --large-block was changed to be * off, which is checked in * recv_check_large_blocks(). */ if (drro->drr_blksz <= doi->doi_data_block_size) return (SET_ERROR(EINVAL)); /* * We keep the existing blocksize and * contents. 
*/ *new_blksz = doi->doi_data_block_size; } else { do_free_range = B_TRUE; } } } /* nblkptr can only decrease if the object was reallocated */ if (nblkptr < doi->doi_nblkptr) do_free_range = B_TRUE; /* number of slots can only change on reallocation */ if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) do_free_range = B_TRUE; /* * For raw sends we also check a few other fields to * ensure we are preserving the objset structure exactly * as it was on the send side: * - A changed indirect block size * - A smaller nlevels */ if (rwa->raw) { if (indblksz != doi->doi_metadata_block_size) do_free_range = B_TRUE; if (drro->drr_nlevels < doi->doi_indirection) do_free_range = B_TRUE; } if (do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } /* * The dmu does not currently support decreasing nlevels or changing * indirect block size if there is already one, same as changing the * number of dnode slots on an object. For non-raw sends this * does not matter and the new object can just use the previous one's * parameters. For raw sends, however, the structure of the received * dnode (including indirects and dnode slots) must match that of the * send side. Therefore, instead of using dmu_object_reclaim(), we * must free the object completely and call dmu_object_claim_dnsize() * instead. */ if ((rwa->raw && ((doi->doi_indirection > 1 && indblksz != doi->doi_metadata_block_size) || drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) return (SET_ERROR(EINVAL)); txg_wait_synced(dmu_objset_pool(rwa->os), 0); *object_to_hold = DMU_NEW_OBJECT; } /* * For raw receives, free everything beyond the new incoming * maxblkid. Normally this would be done with a DRR_FREE * record that would come after this DRR_OBJECT record is * processed. However, for raw receives we manually set the * maxblkid from the drr_maxblkid and so we must first free * everything above that blkid to ensure the DMU is always * consistent with itself. We will never free the first block * of the object here because a maxblkid of 0 could indicate * an object with a single block or one with no blocks. This * free may be skipped when dmu_free_long_range() was called * above since it covers the entire object's contents. */ if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, (drro->drr_maxblkid + 1) * doi->doi_data_block_size, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } return (0); } noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; int err; uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ?
drro->drr_dn_slots : DNODE_MIN_SLOTS; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || dn_slots > (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } if (rwa->raw) { /* * We should have received a DRR_OBJECT_RANGE record * containing this block and stored it in rwa. */ if (drro->drr_object < rwa->or_firstobj || drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || drro->drr_raw_bonuslen < drro->drr_bonuslen || drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || drro->drr_nlevels > DN_MAX_LEVELS || drro->drr_nblkptr > DN_MAX_NBLKPTR || DN_SLOTS_TO_BONUSLEN(dn_slots) < drro->drr_raw_bonuslen) return (SET_ERROR(EINVAL)); } else { /* * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN * record indicates this by setting DRR_FLAG_SPILL_BLOCK. */ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) || (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) { return (SET_ERROR(EINVAL)); } if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 || drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) { return (SET_ERROR(EINVAL)); } } err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ uint64_t object_to_hold; if (err == 0) { err = receive_handle_existing_object(rwa, drro, &doi, data, &object_to_hold, &new_blksz); if (err != 0) return (err); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a * multi-slot dnode. This will be resolved when the next txg * is synced out, since the send stream will have told us * to free this slot when we freed the associated dnode * earlier in the stream. */ txg_wait_synced(dmu_objset_pool(rwa->os), 0); if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT) return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { /* * If the only record in this range so far was DRR_FREEOBJECTS * with at least one actually freed object, it's possible that * the block will now be converted to a hole. We need to wait * for the txg to sync to prevent races. */ if (rwa->or_need_sync == ORNS_YES) txg_wait_synced(dmu_objset_pool(rwa->os), 0); /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } /* Only relevant for the first object in the range */ rwa->or_need_sync = ORNS_NO; /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by * another object from the previous snapshot. We must free * these objects before we attempt to allocate the new dnode. 
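* For example, with dn_slots = 2 the new dnode at object N also occupies slot N + 1, so any old object still allocated at N + 1 must be freed (and that free synced out) before the claim below can succeed.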
*/ if (dn_slots > 1) { boolean_t need_sync = B_FALSE; for (uint64_t slot = drro->drr_object + 1; slot < drro->drr_object + dn_slots; slot++) { dmu_object_info_t slot_doi; err = dmu_object_info(rwa->os, slot, &slot_doi); if (err == ENOENT || err == EEXIST) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, slot); if (err != 0) return (err); need_sync = B_TRUE; } if (need_sync) txg_wait_synced(dmu_objset_pool(rwa->os), 0); } tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object_to_hold); dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) { /* * Currently allocated, the existing version of this object * may reference a spill block that is no longer allocated * at the source and needs to be freed. */ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } if (rwa->or_crypt_params_present) { /* * Set the crypt params for the buffer associated with this * range of dnodes. This causes the blkptr_t to have the * same crypt params (byteorder, salt, iv, mac) as on the * sending side. * * Since we are committing this tx now, it is possible for * the dnode block to end up on-disk with the incorrect MAC, * if subsequent objects in this block are received in a * different txg. However, since the dataset is marked as * inconsistent, no code paths will do a non-raw read (or * decrypt the block / verify the MAC). The receive code and * scrub code can safely do raw reads and verify the * checksum. They don't need to verify the MAC. */ dmu_buf_t *db = NULL; uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_buf_set_crypt_params(db, rwa->or_byteorder, rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); dmu_buf_rele(db, FTAG); rwa->or_crypt_params_present = B_FALSE; } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); /* handle more restrictive dnode structuring for raw recvs */ if (rwa->raw) { /* * Set the indirect block size, block shift, nlevels. * This will not fail because we ensured all of the * blocks were freed earlier if this is a new object. * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, drro->drr_nlevels, tx)); /* * Set the maxblkid. 
This will always succeed because * we freed all blocks beyond the new maxblkid above. */ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, drro->drr_maxblkid, tx)); } if (data != NULL) { dmu_buf_t *db; dnode_t *dn; uint32_t flags = DMU_READ_NO_PREFETCH; if (rwa->raw) flags |= DMU_READ_NO_DECRYPT; VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn)); VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro)); /* * Raw bonus buffers have their byteorder determined by the * DRR_OBJECT_RANGE record. */ if (rwa->byteswap && !rwa->raw) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); } dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } /* * If the receive fails, we want the resume stream to start with the * same record that we last successfully received. There is no way to * request resume from the object record, but we can benefit from the * fact that sender always sends object record before anything else, * after which it will "resend" data at offset 0 and resume normally. */ save_resume_state(rwa, drro->drr_object, 0, tx); dmu_tx_commit(tx); return (0); } noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && obj < DN_MAX_OBJECT && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); if (rwa->or_need_sync == ORNS_MAYBE) rwa->or_need_sync = ORNS_YES; } if (next_err != ESRCH) return (next_err); return (0); } /* * Note: if this fails, the caller will clean up any records left on the * rwa->write_batch list. */ static int flush_write_batch_impl(struct receive_writer_arg *rwa) { dnode_t *dn; int err; if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0) return (SET_ERROR(EINVAL)); struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch); struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write; struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; ASSERT3U(rwa->last_object, ==, last_drrw->drr_object); ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset); dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset, last_drrw->drr_offset - first_drrw->drr_offset + last_drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); dnode_rele(dn, FTAG); return (err); } struct receive_record_arg *rrd; while ((rrd = list_head(&rwa->write_batch)) != NULL) { struct drr_write *drrw = &rrd->header.drr_u.drr_write; abd_t *abd = rrd->abd; ASSERT3U(drrw->drr_object, ==, rwa->last_object); if (drrw->drr_logical_size != dn->dn_datablksz) { /* * The WRITE record is larger than the object's block * size. We must be receiving an incremental * large-block stream into a dataset that previously did * a non-large-block receive. 
Lightweight writes must * be exactly one block, so we need to decompress the * data (if compressed) and do a normal dmu_write(). */ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); if (DRR_WRITE_COMPRESSED(drrw)) { abd_t *decomp_abd = abd_alloc_linear(drrw->drr_logical_size, B_FALSE); err = zio_decompress_data( drrw->drr_compressiontype, abd, decomp_abd, abd_get_size(abd), abd_get_size(decomp_abd), NULL); if (err == 0) { dmu_write_by_dnode(dn, drrw->drr_offset, drrw->drr_logical_size, abd_to_buf(decomp_abd), tx); } abd_free(decomp_abd); } else { dmu_write_by_dnode(dn, drrw->drr_offset, drrw->drr_logical_size, abd_to_buf(abd), tx); } if (err == 0) abd_free(abd); } else { zio_prop_t zp = {0}; dmu_write_policy(rwa->os, dn, 0, 0, &zp); zio_flag_t zio_flags = 0; if (rwa->raw) { zp.zp_encrypt = B_TRUE; zp.zp_compress = drrw->drr_compressiontype; zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ rwa->byteswap; memcpy(zp.zp_salt, drrw->drr_salt, ZIO_DATA_SALT_LEN); memcpy(zp.zp_iv, drrw->drr_iv, ZIO_DATA_IV_LEN); memcpy(zp.zp_mac, drrw->drr_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { zp.zp_nopwrite = B_FALSE; zp.zp_copies = MIN(zp.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, drrw->drr_compressed_size); zp.zp_compress = drrw->drr_compressiontype; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } else if (rwa->byteswap) { /* * Note: compressed blocks never need to be * byteswapped, because WRITE records for * metadata blocks are never compressed. The * exception is raw streams, which are written * in the original byteorder, and the byteorder * bit is preserved in the BP by setting * zp_byteorder above. */ dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func( abd_to_buf(abd), DRR_WRITE_PAYLOAD_SIZE(drrw)); } /* * Since this data can't be read until the receive * completes, we can do a "lightweight" write for * improved performance. */ err = dmu_lightweight_write_by_dnode(dn, drrw->drr_offset, abd, &zp, zio_flags, tx); } if (err != 0) { /* * This rrd is left on the list, so the caller will * free it (and the abd). */ break; } /* * Note: If the receive fails, we want the resume stream to * start with the same record that we last successfully * received (as opposed to the next record), so that we can * verify that we are resuming from the correct location. 
*/ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); list_remove(&rwa->write_batch, rrd); kmem_free(rrd, sizeof (*rrd)); } dmu_tx_commit(tx); dnode_rele(dn, FTAG); return (err); } noinline static int flush_write_batch(struct receive_writer_arg *rwa) { if (list_is_empty(&rwa->write_batch)) return (0); int err = rwa->err; if (err == 0) err = flush_write_batch_impl(rwa); if (err != 0) { struct receive_record_arg *rrd; while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { abd_free(rrd->abd); kmem_free(rrd, sizeof (*rrd)); } } ASSERT(list_is_empty(&rwa->write_batch)); return (err); } noinline static int receive_process_write_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err = 0; ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE); struct drr_write *drrw = &rrd->header.drr_u.drr_write; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; int flags = DB_RF_CANFAIL; if (rwa->raw) flags |= DB_RF_NO_DECRYPT; if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd), DRR_WRITE_PAYLOAD_SIZE(drrw)); } err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, drrw->drr_offset, FTAG, &dbp); if (err != 0) return (err); /* Try to read the object to see if it needs healing */ err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags); /* * We only try to heal when dbuf_read() returns ECKSUM. * Other errors (even EIO) get returned to the caller. * EIO indicates that the device is not present/accessible, * so writing to it will likely fail. * If the block is healthy, we don't want to overwrite it * unnecessarily. */ if (err != ECKSUM) { dmu_buf_rele(dbp, FTAG); return (err); } /* Make sure the on-disk block and recv record sizes match */ if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); dmu_buf_rele(dbp, FTAG); return (err); } /* * For resuming to work, records must be in increasing order * by (object, offset).
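* For instance, (object 5, offset 0) followed by (object 5, offset 131072) and then (object 7, offset 0) is acceptable, whereas a later record that stepped back to (object 5, offset 0) would be rejected with EINVAL below.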
*/ if (drrw->drr_object < rwa->last_object || (drrw->drr_object == rwa->last_object && drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; uint64_t batch_size = MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); if (first_rrd != NULL && (drrw->drr_object != first_drrw->drr_object || drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { err = flush_write_batch(rwa); if (err != 0) return (err); } rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; list_insert_tail(&rwa->write_batch, rrd); /* * Return EAGAIN to indicate that we will use this rrd again, * so the caller should not free it */ return (EAGAIN); } static int receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (SET_ERROR(EINVAL)); if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (SET_ERROR(EINVAL)); if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (SET_ERROR(EINVAL)); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (SET_ERROR(EINVAL)); if (rwa->raw) return (SET_ERROR(EINVAL)); if (drrwe->drr_object > rwa->max_object) rwa->max_object = drrwe->drr_object; tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(rwa->os, drrwe->drr_object, drrwe->drr_offset, data, drrwe->drr_etype, drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); /* See comment in restore_write. */ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, abd_t *abd) { dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); /* * This is an unmodified spill block which was added to the stream * to resolve an issue with incorrectly removing spill blocks. It * should be ignored by current versions of the code which support * the DRR_FLAG_SPILL_BLOCK flag. */ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { abd_free(abd); return (0); } if (rwa->raw) { if (!DMU_OT_IS_VALID(drrs->drr_type) || drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || drrs->drr_compressed_size == 0) return (SET_ERROR(EINVAL)); } if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrs->drr_object > rwa->max_object) rwa->max_object = drrs->drr_object; VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } /* * Spill blocks may both grow and shrink. When a change in size * occurs any existing dbuf must be updated to match the logical * size of the provided arc_buf_t. 
*/ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } arc_buf_t *abuf; if (rwa->raw) { boolean_t byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ rwa->byteswap; abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), drrs->drr_object, byteorder, drrs->drr_salt, drrs->drr_iv, drrs->drr_mac, drrs->drr_type, drrs->drr_compressed_size, drrs->drr_length, drrs->drr_compressiontype, 0); } else { abuf = arc_loan_buf(dmu_objset_spa(rwa->os), DMU_OT_IS_METADATA(drrs->drr_type), drrs->drr_length); if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrs->drr_type); dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); } } memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrf->drr_object > rwa->max_object) rwa->max_object = drrf->drr_object; err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } static int receive_object_range(struct receive_writer_arg *rwa, struct drr_object_range *drror) { /* * By default, we assume this block is in our native format * (ZFS_HOST_BYTEORDER). We then take into account whether * the send stream is byteswapped (rwa->byteswap). Finally, * we need to byteswap again if this particular block was * in non-native format on the send side. */ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); /* * Since dnode block sizes are constant, we should not need to worry * about making sure that the dnode block size is the same on the * sending and receiving sides for the time being. For non-raw sends, * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE * record at all). Raw sends require this record type because the * encryption parameters are used to protect an entire block of bonus * buffers. If the size of dnode blocks ever becomes variable, * handling will need to be added to ensure that dnode block sizes * match on the sending and receiving side. */ if (drror->drr_numslots != DNODES_PER_BLOCK || P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || !rwa->raw) return (SET_ERROR(EINVAL)); if (drror->drr_firstobj > rwa->max_object) rwa->max_object = drror->drr_firstobj; /* * The DRR_OBJECT_RANGE handling must be deferred to receive_object() * so that the block of dnodes is not written out when it's empty, * and converted to a HOLE BP. */ rwa->or_crypt_params_present = B_TRUE; rwa->or_firstobj = drror->drr_firstobj; rwa->or_numslots = drror->drr_numslots; memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN); memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN); memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; rwa->or_need_sync = ORNS_MAYBE; return (0); } /* * Until we have the ability to redact large ranges of data efficiently, we * process these records as frees. 
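* In other words, receive_redact() below simply synthesizes a drr_free record covering the same object, offset, length and toguid, and hands it to receive_free().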
*/ noinline static int receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) { struct drr_free drrf = {0}; drrf.drr_length = drrr->drr_length; drrf.drr_object = drrr->drr_object; drrf.drr_offset = drrr->drr_offset; drrf.drr_toguid = drrr->drr_toguid; return (receive_free(rwa, &drrf)); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; ds_hold_flags_t dsflags; dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has * been written out to disk. For raw receives, this ensures * that the user accounting code will not attempt to do anything * after we stopped receiving the dataset. */ txg_wait_synced(ds->ds_dir->dd_pool, 0); ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); if (drc->drc_resumable && drc->drc_should_save && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); if (!drc->drc_heal) (void) dsl_destroy_head(name); } } static void receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf) { if (drc->drc_byteswap) { (void) fletcher_4_incremental_byteswap(buf, len, &drc->drc_cksum); } else { (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. * Allocate drc->drc_next_rrd and read the next record's header into * drc->drc_next_rrd->header. * Verify checksum of payload and next record. */ static int receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) { int err; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); err = receive_read(drc, len, buf); if (err != 0) return (err); receive_cksum(drc, len, buf); /* note: rrd is NULL when reading the begin record's payload */ if (drc->drc_rrd != NULL) { drc->drc_rrd->payload = buf; drc->drc_rrd->payload_size = len; drc->drc_rrd->bytes_read = drc->drc_bytes_read; } } else { ASSERT3P(buf, ==, NULL); } drc->drc_prev_cksum = drc->drc_cksum; drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP); err = receive_read(drc, sizeof (drc->drc_next_rrd->header), &drc->drc_next_rrd->header); drc->drc_next_rrd->bytes_read = drc->drc_bytes_read; if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (err); } if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (SET_ERROR(EINVAL)); } /* * Note: checksum is of everything up to but not including the * checksum itself. 
*/ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); receive_cksum(drc, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &drc->drc_next_rrd->header); zio_cksum_t cksum_orig = drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; zio_cksum_t *cksump = &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; if (drc->drc_byteswap) byteswap_record(&drc->drc_next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (SET_ERROR(ECKSUM)); } receive_cksum(drc, sizeof (cksum_orig), &cksum_orig); return (0); } /* * Issue the prefetch reads for any necessary indirect blocks. * * We use the object ignore list to tell us whether or not to issue prefetches * for a given object. We do this for both correctness (in case the blocksize * of an object has changed) and performance (if the object doesn't exist, don't * needlessly try to issue prefetches). We also trim the list as we go through * the stream to prevent it from growing to an unbounded size. * * The object numbers within will always be in sorted order, and any write * records we see will also be in sorted order, but they're not sorted with * respect to each other (i.e. we can get several object records before * receiving each object's write records). As a result, once we've reached a * given object number, we can safely remove any reference to lower object * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ static void receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, uint64_t length) { if (!objlist_exists(drc->drc_ignore_objlist, object)) { dmu_prefetch(drc->drc_os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } /* * Read records off the stream, issuing any necessary prefetches. */ static int receive_read_record(dmu_recv_cookie_t *drc) { int err; switch (drc->drc_rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &drc->drc_rrd->header.drr_u.drr_object; uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); void *buf = NULL; dmu_object_info_t doi; if (size != 0) buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. 
*/ if (err == ENOENT || err == EEXIST || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(drc->drc_ignore_objlist, drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; int size = DRR_WRITE_PAYLOAD_SIZE(drrw); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) { abd_free(abd); return (err); } drc->drc_rrd->abd = abd; receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drc->drc_rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: case DRR_REDACT: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. */ err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_END: { struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum, drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; int size = DRR_SPILL_PAYLOAD_SIZE(drrs); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) abd_free(abd); else drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } default: return (SET_ERROR(EINVAL)); } } static void dprintf_drr(struct receive_record_arg *rrd, int err) { #ifdef ZFS_DEBUG switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; dprintf("drr_type = OBJECT obj = %llu type = %u " "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " "compress = %u dn_slots = %u err = %d\n", (u_longlong_t)drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen, drro->drr_checksumtype, drro->drr_compress, drro->drr_dn_slots, err); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; dprintf("drr_type = FREEOBJECTS firstobj = %llu " "numobjs = %llu err = %d\n", (u_longlong_t)drrfo->drr_firstobj, (u_longlong_t)drrfo->drr_numobjs, err); break; } case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " "lsize = %llu cksumtype = %u flags = %u " "compress = %u psize = %llu err = %d\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, (u_longlong_t)drrw->drr_offset, (u_longlong_t)drrw->drr_logical_size, drrw->drr_checksumtype, drrw->drr_flags, drrw->drr_compressiontype, (u_longlong_t)drrw->drr_compressed_size, err); break; } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwbr = &rrd->header.drr_u.drr_write_byref; dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu " "length = %llu toguid = %llx refguid = %llx " "refobject = %llu refoffset = %llu cksumtype = %u " "flags = %u err = %d\n", (u_longlong_t)drrwbr->drr_object, 
(u_longlong_t)drrwbr->drr_offset, (u_longlong_t)drrwbr->drr_length, (u_longlong_t)drrwbr->drr_toguid, (u_longlong_t)drrwbr->drr_refguid, (u_longlong_t)drrwbr->drr_refobject, (u_longlong_t)drrwbr->drr_refoffset, drrwbr->drr_checksumtype, drrwbr->drr_flags, err); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " "length = %llu compress = %u etype = %u lsize = %u " "psize = %u err = %d\n", (u_longlong_t)drrwe->drr_object, (u_longlong_t)drrwe->drr_offset, (u_longlong_t)drrwe->drr_length, drrwe->drr_compression, drrwe->drr_etype, drrwe->drr_lsize, drrwe->drr_psize, err); break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; dprintf("drr_type = FREE obj = %llu offset = %llu " "length = %lld err = %d\n", (u_longlong_t)drrf->drr_object, (u_longlong_t)drrf->drr_offset, (longlong_t)drrf->drr_length, err); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; dprintf("drr_type = SPILL obj = %llu length = %llu " "err = %d\n", (u_longlong_t)drrs->drr_object, (u_longlong_t)drrs->drr_length, err); break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; dprintf("drr_type = OBJECT_RANGE firstobj = %llu " "numslots = %llu flags = %u err = %d\n", (u_longlong_t)drror->drr_firstobj, (u_longlong_t)drror->drr_numslots, drror->drr_flags, err); break; } default: return; } #endif } /* * Commit the records to the pool. */ static int receive_process_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err; /* Processing in order, therefore bytes_read should be increasing. */ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; /* We can only heal write records; other ones get ignored */ if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (0); } if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (err); } } switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; err = receive_freeobjects(rwa, drrfo); break; } case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); if (rwa->heal) { /* * If healing - always free the abd after processing */ abd_free(rrd->abd); rrd->abd = NULL; } else if (err != EAGAIN) { /* * On success, a non-healing * receive_process_write_record() returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. 
*/ ASSERT(err != 0); abd_free(rrd->abd); rrd->abd = NULL; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; err = receive_free(rwa, drrf); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; err = receive_object_range(rwa, drror); break; } case DRR_REDACT: { struct drr_redact *drrr = &rrd->header.drr_u.drr_redact; err = receive_redact(rwa, drrr); break; } default: err = (SET_ERROR(EINVAL)); } if (err != 0) dprintf_drr(rrd, err); return (err); } /* * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record. When we're done, signal the main thread and exit. */ static __attribute__((noreturn)) void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; struct receive_record_arg *rrd; fstrans_cookie_t cookie = spl_fstrans_mark(); for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; rrd = bqueue_dequeue(&rwa->q)) { /* * If there's an error, the main thread will stop putting things * on the queue, but we need to clear everything in it before we * can exit. */ int err = 0; if (rwa->err == 0) { err = receive_process_record(rwa, rrd); } else if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } /* * EAGAIN indicates that this record has been saved (on * rwa->write_batch), and will be used again, so we don't * free it. * When healing data we always need to free the record. */ if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); } } kmem_free(rrd, sizeof (*rrd)); if (rwa->heal) { zio_wait(rwa->heal_pio); } else { int err = flush_write_batch(rwa); if (rwa->err == 0) rwa->err = err; } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); mutex_exit(&rwa->mutex); spl_fstrans_unmark(cookie); thread_exit(); } static int resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) { uint64_t val; objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset; uint64_t dsobj = dmu_objset_id(drc->drc_os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &resume_obj) != 0 || nvlist_lookup_uint64(begin_nvl, "resume_offset", &resume_off) != 0) { return (SET_ERROR(EINVAL)); } VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); if (resume_obj != val) return (SET_ERROR(EINVAL)); VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); if (resume_off != val) return (SET_ERROR(EINVAL)); return (0); } /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a * worker thread, read the records off the stream one by one, and issue * prefetches for any necessary indirect blocks. It will then push the records * onto an internal blocking queue. The worker thread will pull the records off * the queue, and actually write the data into the DMU.
This way, the worker * thread doesn't have to wait for reads to complete, since everything it needs * (the indirect blocks) will be prefetched. * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { uint64_t bytes = 0; (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (bytes), 1, &bytes); drc->drc_bytes_read += bytes; } drc->drc_ignore_objlist = objlist_create(); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); ASSERT0(drc->drc_os->os_encrypted && (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); /* handle DSL encryption key payload */ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { nvlist_t *keynvl = NULL; ASSERT(drc->drc_os->os_encrypted); ASSERT(drc->drc_raw); err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata", &keynvl); if (err != 0) goto out; if (!drc->drc_heal) { /* * If this is a new dataset we set the key immediately. * Otherwise we don't want to change the key until we * are sure the rest of the receive succeeded so we * stash the keynvl away until then. */ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) goto out; } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid", &drc->drc_ivset_guid); if (!drc->drc_newfs) drc->drc_keynvl = fnvlist_dup(keynvl); } if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(drc, drc->drc_begin_nvl); if (err != 0) goto out; } /* * For compatibility with recursive send streams, we do this here, * rather than in dmu_recv_begin. If we pull the next header too * early, and it's the END record, we break the `recv_skip` logic. */ if (drc->drc_drr_begin->drr_payloadlen == 0) { err = receive_read_payload_and_next_header(drc, 0, NULL); if (err != 0) goto out; } /* * If we failed before this point we will clean up any new resume * state that was created. Now that we've gotten past the initial * checks we are ok to retain that resume state. */ drc->drc_should_save = B_TRUE; (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; rwa->heal = drc->drc_heal; rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; if (drc->drc_heal) { rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, ZIO_FLAG_GODFATHER); } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. 
It's ok if we * miss a write for an iteration or two of the loop, since the writer * thread will keep freeing records we send it until we send it an eos * marker. * * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the * first loop and drc->drc_rrd was never allocated, or it's later, and * drc->drc_rrd has been handed off to the writer thread who will free * it. Finally, if receive_read_record fails or we're at the end of the * stream, then we free drc->drc_rrd and exit. */ while (rwa->err == 0) { if (issig()) { err = SET_ERROR(EINTR); break; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = drc->drc_next_rrd; drc->drc_next_rrd = NULL; /* Allocates and loads header into drc->drc_next_rrd */ err = receive_read_record(drc); if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) { kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd)); drc->drc_rrd = NULL; break; } bqueue_enqueue(&rwa->q, drc->drc_rrd, sizeof (struct receive_record_arg) + drc->drc_rrd->payload_size); drc->drc_rrd = NULL; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); drc->drc_rrd->eos_marker = B_TRUE; bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); mutex_enter(&rwa->mutex); while (!rwa->done) { /* * We need to use cv_wait_sig() so that any process that may * be sleeping here can still fork. */ (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } mutex_exit(&rwa->mutex); /* * If we are receiving a full stream as a clone, all object IDs which * are greater than the maximum ID referenced in the stream are * by definition unused and must be freed. */ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { uint64_t obj = rwa->max_object + 1; int free_err = 0; int next_err = 0; while (next_err == 0) { free_err = dmu_free_long_object(rwa->os, obj); if (free_err != 0 && free_err != ENOENT) break; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0); } if (err == 0) { if (free_err != 0 && free_err != ENOENT) err = free_err; else if (next_err != ESRCH) err = next_err; } } cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); list_destroy(&rwa->write_batch); if (err == 0) err = rwa->err; out: /* * If we hit an error before we started the receive_writer_thread * we need to clean up the next_rrd we create by processing the * DRR_BEGIN record. */ if (drc->drc_next_rrd != NULL) kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); /* * The objset will be invalidated by dmu_recv_end() when we do * dsl_dataset_clone_swap_sync_impl(). */ drc->drc_os = NULL; kmem_free(rwa, sizeof (*rwa)); nvlist_free(drc->drc_begin_nvl); if (err != 0) { /* * Clean up references. If receive is not resumable, * destroy what we created, so we don't leave it in * the inconsistent state. */ dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); } objlist_destroy(drc->drc_ignore_objlist); drc->drc_ignore_objlist = NULL; *voffp = drc->drc_voff; return (err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (drc->drc_heal) { error = 0; } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. 
before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) break; if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) break; } if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } if (drc->drc_keynvl != NULL) { error = dsl_crypto_recv_raw_key_check(drc->drc_ds, drc->drc_keynvl, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred, drc->drc_proc); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred, drc->drc_proc); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; uint64_t newsnapobj = 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; if (drc->drc_heal) { if (drc->drc_keynvl != NULL) { nvlist_free(drc->drc_keynvl); drc->drc_keynvl = NULL; } } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } if (drc->drc_keynvl != NULL) { dsl_crypto_recv_raw_key_sync(drc->drc_ds, drc->drc_keynvl, tx); nvlist_free(drc->drc_keynvl); drc->drc_keynvl = NULL; } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); /* * The objset was evicted by dsl_dataset_clone_swap_sync_impl, * so drc_os is no longer valid. 
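Both the check path above and the sync path walk the snapshot chain backwards through ds_prev_snap_obj, from origin_head's newest snapshot down to, but not including, the snapshot before drc_ds. A toy userland illustration of that walk, with an invented struct snap standing in for the on-disk chain:

/* Illustrative only: walking a prev-snapshot chain back to a boundary. */
#include <stdio.h>

struct snap {
	const char *name;
	struct snap *prev;	/* stands in for ds_prev_snap_obj */
};

/*
 * Visit every snapshot strictly between origin_head's newest snapshot and
 * the boundary snapshot (the snap before drc_ds), newest first. These are
 * the snapshots a forced receive has to destroy.
 */
static void
visit_snaps_to_destroy(struct snap *origin_head_prev, struct snap *boundary)
{
	for (struct snap *s = origin_head_prev; s != boundary; s = s->prev)
		printf("would destroy %s\n", s->name);
}

int
main(void)
{
	/* chain: @a <- @b <- @c (newest) */
	struct snap a = { "tofs@a", NULL };
	struct snap b = { "tofs@b", &a };
	struct snap c = { "tofs@c", &b };

	/*
	 * If drc_ds was cloned from @a, its prev snapshot is @a, so a forced
	 * receive destroys @c and then @b before swapping in the new data.
	 */
	visit_snaps_to_destroy(&c, &a);
	return (0);
}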
*/ drc->drc_os = NULL; dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; newsnapobj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; if (dsl_dataset_has_resume_receive_state(ds)) { (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx); } newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } /* * If this is a raw receive, the crypt_keydata nvlist will include * a to_ivset_guid for us to set on the new snapshot. This value * will override the value generated by the snapshot code. However, * this value may not be present, because older implementations of * the raw send code did not include this value, and we are still * allowed to receive them if the zfs_disable_ivset_guid_check * tunable is set, in which case we will leave the newly-generated * value. */ if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, &drc->drc_ivset_guid, tx)); } /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode * we can evict its bonus buffer. Since the dataset may be destroyed * at this point (and therefore won't have a valid pointer to the spa) * we release the key mapping manually here while we do have a valid * pointer, if it exists. */ if (!drc->drc_raw && encrypted) { (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, drc->drc_ds->ds_object, drc->drc_ds); } dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); drc->drc_ds = NULL; } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. 
*/ char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { int error; drc->drc_owner = owner; if (drc->drc_newfs) error = dmu_recv_new_end(drc); else error = dmu_recv_existing_end(drc); if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); } else if (!drc->drc_heal) { if (drc->drc_newfs) { zvol_create_minor(drc->drc_tofs); } char *snapname = kmem_asprintf("%s@%s", drc->drc_tofs, drc->drc_tosnap); zvol_create_minor(snapname); kmem_strfree(snapname); } return (error); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW, "Maximum receive queue length"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW, "Receive queue fill fraction"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW, "Maximum amount of writes to batch into one transaction"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, "Ignore errors during corrective receive"); /* END CSTYLED */ diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index c7d3a5cb6e7f..a174972e9b57 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1,3126 +1,3137 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. 
* Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ static int zfs_send_corrupt_data = B_FALSE; /* * This tunable controls the amount of data (measured in bytes) that will be * prefetched by zfs send. If the main thread is blocking on reads that haven't * completed, this variable might need to be increased. If instead the main * thread is issuing new reads because the prefetches have fallen out of the * cache, this may need to be decreased. */ static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE; /* * This tunable controls the length of the queues that zfs send worker threads * use to communicate. If the send_main_thread is blocking on these queues, * this variable may need to be increased. If there is a significant slowdown * at the start of a send as these threads consume all the available IO * resources, this variable may need to be decreased. */ static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs send. The fill * fraction controls the frequency with which threads have to be cv_signaled. * If a lot of cpu time is being spent on cv_signal, then these should be tuned * down. If the queues empty before the signalled thread can catch up, then * these should be tuned up. */ static uint_t zfs_send_queue_ff = 20; static uint_t zfs_send_no_prefetch_queue_ff = 20; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ static uint_t zfs_override_estimate_recordsize = 0; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ static const boolean_t zfs_send_set_freerecords_bit = B_TRUE; /* Set this tunable to FALSE to disable sending unmodified spill blocks.
*/ static int zfs_send_unmodified_spill_blocks = B_TRUE; static inline boolean_t overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) { uint64_t temp = a * b; if (b != 0 && temp / b != a) return (B_FALSE); *c = temp; return (B_TRUE); } struct send_thread_arg { bqueue_t q; objset_t *os; /* Objset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; uint64_t *num_blocks_visited; }; struct redact_list_thread_arg { boolean_t cancel; bqueue_t q; zbookmark_phys_t resume; redaction_list_t *rl; boolean_t mark_redact; int error_code; uint64_t *num_blocks_visited; }; struct send_merge_thread_arg { bqueue_t q; objset_t *os; struct redact_list_thread_arg *from_arg; struct send_thread_arg *to_arg; struct redact_list_thread_arg *redact_arg; int error; boolean_t cancel; }; struct send_range { boolean_t eos_marker; /* Marks the end of the stream */ uint64_t object; uint64_t start_blkid; uint64_t end_blkid; bqueue_node_t ln; enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT, PREVIOUSLY_REDACTED} type; union { struct srd { dmu_object_type_t obj_type; uint32_t datablksz; // logical size uint32_t datasz; // payload size blkptr_t bp; arc_buf_t *abuf; abd_t *abd; kmutex_t lock; kcondvar_t cv; boolean_t io_outstanding; boolean_t io_compressed; int io_err; } data; struct srh { uint32_t datablksz; } hole; struct sro { /* * This is a pointer because embedding it in the * struct causes these structures to be massively larger * for all range types; this makes the code much less * memory efficient. */ dnode_phys_t *dnp; blkptr_t bp; } object; struct srr { uint32_t datablksz; } redact; struct sror { blkptr_t bp; } object_range; } sru; }; /* * The list of data whose inclusion in a send stream can be pending from * one call to backup_cb to another. Multiple calls to dump_free(), * dump_freeobjects(), and dump_redact() can be aggregated into a single * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record. */ typedef enum { PENDING_NONE, PENDING_FREE, PENDING_FREEOBJECTS, PENDING_REDACT } dmu_pendop_t; typedef struct dmu_send_cookie { dmu_replay_record_t *dsc_drr; dmu_send_outparams_t *dsc_dso; offset_t *dsc_off; objset_t *dsc_os; zio_cksum_t dsc_zc; uint64_t dsc_toguid; uint64_t dsc_fromtxg; int dsc_err; dmu_pendop_t dsc_pending_op; uint64_t dsc_featureflags; uint64_t dsc_last_data_object; uint64_t dsc_last_data_offset; uint64_t dsc_resume_object; uint64_t dsc_resume_offset; boolean_t dsc_sent_begin; boolean_t dsc_sent_end; } dmu_send_cookie_t; static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range); static void range_free(struct send_range *range) { if (range->type == OBJECT) { size_t size = sizeof (dnode_phys_t) * (range->sru.object.dnp->dn_extra_slots + 1); kmem_free(range->sru.object.dnp, size); } else if (range->type == DATA) { mutex_enter(&range->sru.data.lock); while (range->sru.data.io_outstanding) cv_wait(&range->sru.data.cv, &range->sru.data.lock); if (range->sru.data.abd != NULL) abd_free(range->sru.data.abd); if (range->sru.data.abuf != NULL) { arc_buf_destroy(range->sru.data.abuf, &range->sru.data.abuf); } mutex_exit(&range->sru.data.lock); cv_destroy(&range->sru.data.cv); mutex_destroy(&range->sru.data.lock); } kmem_free(range, sizeof (*range)); } /* * For all record types except BEGIN, fill in the checksum (overlaid in * drr_u.drr_checksum.drr_checksum). The checksum verifies everything * up to the start of the checksum itself. 
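The framing described here, a running checksum over everything up to, but not including, the checksum field that ends the record, can be sketched with a simplified Fletcher-4-style accumulator. This is a toy mirror, not the kernel's fletcher_4_incremental_native(); struct toy_record and its fields are invented, and byte order and padding are ignored:

/*
 * Illustrative only: a Fletcher-4-style running checksum over a record whose
 * checksum field is the last thing in the record.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct cksum { uint64_t w[4]; };

struct toy_record {
	uint64_t type;
	uint64_t object;
	uint64_t offset;
	struct cksum cksum;	/* must stay the final field */
};

/* Fold `size` bytes (assumed to be a multiple of 4) into the accumulator. */
static void
fletcher4_incremental(const void *buf, size_t size, struct cksum *zc)
{
	const uint32_t *ip = buf;
	const uint32_t *end = ip + size / sizeof (uint32_t);
	uint64_t a = zc->w[0], b = zc->w[1], c = zc->w[2], d = zc->w[3];

	for (; ip < end; ip++) {
		a += *ip;
		b += a;
		c += b;
		d += c;
	}
	zc->w[0] = a; zc->w[1] = b; zc->w[2] = c; zc->w[3] = d;
}

int
main(void)
{
	struct cksum running = { { 0 } };
	struct toy_record rec = { .type = 1, .object = 7, .offset = 4096 };

	/* Checksum everything up to the start of the checksum itself... */
	fletcher4_incremental(&rec, offsetof(struct toy_record, cksum),
	    &running);
	/* ...overlay the running value into the trailing field... */
	rec.cksum = running;
	/* ...then fold the stored checksum back into the stream state. */
	fletcher4_incremental(&rec.cksum, sizeof (rec.cksum), &running);

	printf("record checksum: %llx %llx %llx %llx\n",
	    (unsigned long long)rec.cksum.w[0],
	    (unsigned long long)rec.cksum.w[1],
	    (unsigned long long)rec.cksum.w[2],
	    (unsigned long long)rec.cksum.w[3]);
	return (0);
}

dump_record(), which follows, applies the same per-record steps: checksum the header up to the checksum offset, overlay the running value for non-BEGIN records, fold the stored checksum into the stream state, and then do the same for any payload.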
*/ static int dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len) { dmu_send_outparams_t *dso = dscp->dsc_dso; ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(dscp->dsc_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dscp->dsc_zc); if (dscp->dsc_drr->drr_type == DRR_BEGIN) { dscp->dsc_sent_begin = B_TRUE; } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u. drr_checksum.drr_checksum)); dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc; } if (dscp->dsc_drr->drr_type == DRR_END) { dscp->dsc_sent_end = B_TRUE; } (void) fletcher_4_incremental_native(&dscp->dsc_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dscp->dsc_zc); *dscp->dsc_off += sizeof (dmu_replay_record_t); dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr, sizeof (dmu_replay_record_t), dso->dso_arg); if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { *dscp->dsc_off += payload_len; /* * payload is null when dso_dryrun == B_TRUE (i.e. when we're * doing a send size calculation) */ if (payload != NULL) { (void) fletcher_4_incremental_native( payload, payload_len, &dscp->dsc_zc); } /* * The code does not rely on this (len being a multiple of 8). * We keep this assertion because of the corresponding assertion * in receive_read(). Keeping this assertion ensures that we do * not inadvertently break backwards compatibility (causing the * assertion in receive_read() to trigger on old software). * * Raw sends cannot be received on old software, and so can * bypass this assertion. */ ASSERT((payload_len % 8 == 0) || (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)); dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload, payload_len, dso->dso_arg); if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); } return (0); } /* * Fill in the drr_free struct, or perform aggregation if the previous record is * also a free record, and the two are adjacent. * * Note that we send free records even for a full send, because we want to be * able to receive a full send as a clone, which requires a list of all the free * and freeobject records that were generated on the source. */ static int dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dscp->dsc_last_data_object || (object == dscp->dsc_last_data_object && offset > dscp->dsc_last_data_offset)); /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records). 
*/ if (dscp->dsc_pending_op != PENDING_NONE && dscp->dsc_pending_op != PENDING_FREE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } if (dscp->dsc_pending_op == PENDING_FREE) { /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { if (offset + length < offset || length == UINT64_MAX) drrf->drr_length = UINT64_MAX; else drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; if (offset + length < offset) drrf->drr_length = DMU_OBJECT_END; else drrf->drr_length = length; drrf->drr_toguid = dscp->dsc_toguid; if (length == DMU_OBJECT_END) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { dscp->dsc_pending_op = PENDING_FREE; } return (0); } /* * Fill in the drr_redact struct, or perform aggregation if the previous record * is also a redaction record, and the two are adjacent. */ static int dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact; /* * If there is a pending op, but it's not PENDING_REDACT, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_REDACT records can only be aggregated with * other DRR_REDACT records). */ if (dscp->dsc_pending_op != PENDING_NONE && dscp->dsc_pending_op != PENDING_REDACT) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } if (dscp->dsc_pending_op == PENDING_REDACT) { /* * Check to see whether this redacted block can be aggregated * with pending one. */ if (drrr->drr_object == object && drrr->drr_offset + drrr->drr_length == offset) { drrr->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } } /* create a REDACT record and make it pending */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_REDACT; drrr->drr_object = object; drrr->drr_offset = offset; drrr->drr_length = length; drrr->drr_toguid = dscp->dsc_toguid; dscp->dsc_pending_op = PENDING_REDACT; return (0); } static int dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, boolean_t io_compressed, void *data) { uint64_t payload_size; boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dscp->dsc_last_data_object || (object == dscp->dsc_last_data_object && offset > dscp->dsc_last_data_offset)); dscp->dsc_last_data_object = object; dscp->dsc_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. 
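The aggregation rule used by dump_free() and dump_redact() above, where a new range can only extend the pending record when it is for the same object and starts exactly where the pending record ends, is easy to see in isolation. A small userland sketch of that coalescing, with invented types and without the overflow and DMU_OBJECT_END handling of the real code:

/* Illustrative only: coalescing adjacent frees into one pending record. */
#include <stdint.h>
#include <stdio.h>

struct pending_free {
	int valid;		/* is there a pending record at all? */
	uint64_t object;
	uint64_t offset;
	uint64_t length;
};

static void
flush(struct pending_free *p)
{
	if (p->valid)
		printf("FREE obj=%llu off=%llu len=%llu\n",
		    (unsigned long long)p->object,
		    (unsigned long long)p->offset,
		    (unsigned long long)p->length);
	p->valid = 0;
}

static void
dump_free_toy(struct pending_free *p, uint64_t object, uint64_t offset,
    uint64_t length)
{
	/* Extend the pending record only if the new range is adjacent. */
	if (p->valid && p->object == object &&
	    p->offset + p->length == offset) {
		p->length += length;
		return;
	}
	flush(p);		/* not a continuation: push the old one out */
	p->valid = 1;
	p->object = object;
	p->offset = offset;
	p->length = length;
}

int
main(void)
{
	struct pending_free p = { 0 };

	dump_free_toy(&p, 5, 0, 4096);
	dump_free_toy(&p, 5, 4096, 4096);	/* coalesces with the above */
	dump_free_toy(&p, 5, 16384, 4096);	/* gap: flushes, starts anew */
	flush(&p);
	return (0);
}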
*/ if (dscp->dsc_pending_op != PENDING_NONE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } /* write a WRITE record */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ boolean_t compressed = (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && io_compressed : lsize != psize); if (raw || compressed) { ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3S(psize, >, 0); if (raw) { ASSERT(BP_IS_PROTECTED(bp)); /* * This is a raw protected block so we need to pass * along everything the receiving side will need to * interpret this block, including the byteswap, salt, * IV, and MAC. */ if (BP_SHOULD_BYTESWAP(bp)) drrw->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drrw->drr_salt, drrw->drr_iv); zio_crypt_decode_mac_bp(bp, drrw->drr_mac); } else { /* this is a compressed block */ ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); ASSERT3S(lsize, >=, psize); } /* set fields common to compressed and raw sends */ drrw->drr_compressiontype = BP_GET_COMPRESS(bp); drrw->drr_compressed_size = psize; payload_size = drrw->drr_compressed_size; } else { payload_size = drrw->drr_logical_size; } if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) { /* * There's no pre-computed checksum for partial-block writes, * embedded BP's, or encrypted BP's that are being sent as * plaintext, so (like fletcher4-checksummed blocks) userland * will have to compute a dedup-capable checksum itself. 
*/ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & ZCHECKSUM_FLAG_DEDUP) drrw->drr_flags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dscp->dsc_drr->drr_u.drr_write_embedded); if (dscp->dsc_pending_op != PENDING_NONE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dscp->dsc_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); uint32_t psize = drrw->drr_psize; uint32_t rsize = P2ROUNDUP(psize, 8); if (psize != rsize) memset(buf + psize, 0, rsize - psize); if (dump_record(dscp, buf, rsize) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, void *data) { struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill); uint64_t blksz = BP_GET_LSIZE(bp); uint64_t payload_size = blksz; if (dscp->dsc_pending_op != PENDING_NONE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } /* write a SPILL record */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dscp->dsc_toguid; /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } /* handle raw send fields */ if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) drrs->drr_flags |= DRR_RAW_BYTESWAP; drrs->drr_compressiontype = BP_GET_COMPRESS(bp); drrs->drr_compressed_size = BP_GET_PSIZE(bp); zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv); zio_crypt_decode_mac_bp(bp, drrs->drr_mac); payload_size = drrs->drr_compressed_size; } if (dump_record(dscp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects); uint64_t maxobj = DNODES_PER_BLOCK * (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1); /* * ZoL < 0.7 does not handle large FREEOBJECTS records correctly, * leading to zfs recv never completing. to avoid this issue, don't * send FREEOBJECTS records for object IDs which cannot exist on the * receiving side. 
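The clamp applied just below is plain interval arithmetic: with maxobj being the first object ID that cannot exist on the receiver, a request covering [firstobj, firstobj + numobjs) is dropped entirely if it starts at or past maxobj, and truncated to end at maxobj otherwise. A hedged userland sketch (clamp_freeobjects() and the sample numbers are invented):

/* Illustrative only: clamping a FREEOBJECTS range to objects that can exist. */
#include <stdint.h>
#include <stdio.h>

/*
 * Returns the (possibly reduced) number of objects to free, or 0 if the
 * whole range lies beyond maxobj, the first object ID that cannot exist
 * on the receiving side.
 */
static uint64_t
clamp_freeobjects(uint64_t firstobj, uint64_t numobjs, uint64_t maxobj)
{
	if (maxobj == 0)
		return (numobjs);	/* no clamp available */
	if (maxobj <= firstobj)
		return (0);		/* entirely past the end: drop it */
	if (maxobj < firstobj + numobjs)
		numobjs = maxobj - firstobj;	/* truncate */
	return (numobjs);
}

int
main(void)
{
	/* e.g. 32 dnodes per block and a meta-dnode maxblkid of 3 */
	uint64_t maxobj = 32 * (3 + 1);		/* 128 */

	printf("%llu\n", (unsigned long long)clamp_freeobjects(100, 10,
	    maxobj));	/* 10: fits entirely */
	printf("%llu\n", (unsigned long long)clamp_freeobjects(120, 50,
	    maxobj));	/* 8: truncated at object 128 */
	printf("%llu\n", (unsigned long long)clamp_freeobjects(200, 10,
	    maxobj));	/* 0: nothing to send */
	return (0);
}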
*/ if (maxobj > 0) { if (maxobj <= firstobj) return (0); if (maxobj < firstobj + numobjs) numobjs = maxobj - firstobj; } /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records). */ if (dscp->dsc_pending_op != PENDING_NONE && dscp->dsc_pending_op != PENDING_FREEOBJECTS) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. Push out pending record */ if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dscp->dsc_toguid; dscp->dsc_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object); int bonuslen; if (object < dscp->dsc_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. In * this case it's unnecessary to send the dnodes prior to * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. */ ASSERT3U(dscp->dsc_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dscp, object, 1)); if (dscp->dsc_pending_op != PENDING_NONE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } /* write an OBJECT record */ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dscp->dsc_toguid; if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { ASSERT(BP_IS_ENCRYPTED(bp)); if (BP_SHOULD_BYTESWAP(bp)) drro->drr_flags |= DRR_RAW_BYTESWAP; /* needed for reconstructing dnp on recv side */ drro->drr_maxblkid = dnp->dn_maxblkid; drro->drr_indblkshift = dnp->dn_indblkshift; drro->drr_nlevels = dnp->dn_nlevels; drro->drr_nblkptr = dnp->dn_nblkptr; /* * Since we encrypt the entire bonus area, the (raw) part * beyond the bonuslen is actually nonzero, so we need * to send it. 
*/ if (bonuslen != 0) { if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp)) return (SET_ERROR(EINVAL)); drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); bonuslen = drro->drr_raw_bonuslen; } } /* * DRR_OBJECT_SPILL is set for every dnode which references a * spill block. This allows the receiving pool to definitively * determine when a spill block should be kept or freed. */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) drro->drr_flags |= DRR_OBJECT_SPILL; if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); /* * Send DRR_SPILL records for unmodified spill blocks. This is useful * because changing certain attributes of the object (e.g. blocksize) * can cause old versions of ZFS to incorrectly remove a spill block. * Including these records in the stream forces an up to date version * to always be written ensuring they're never lost. Current versions * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can * ignore these unmodified spill blocks. */ if (zfs_send_unmodified_spill_blocks && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { struct send_range record; blkptr_t *bp = DN_SPILL_BLKPTR(dnp); memset(&record, 0, sizeof (struct send_range)); record.type = DATA; record.object = object; record.eos_marker = B_FALSE; record.start_blkid = DMU_SPILL_BLKID; record.end_blkid = record.start_blkid + 1; record.sru.data.bp = *bp; record.sru.data.obj_type = dnp->dn_type; record.sru.data.datablksz = BP_GET_LSIZE(bp); if (do_dump(dscp, &record) != 0) return (SET_ERROR(EINTR)); } if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t firstobj, uint64_t numslots) { struct drr_object_range *drror = &(dscp->dsc_drr->drr_u.drr_object_range); /* we only use this record type for raw sends */ ASSERT(BP_IS_PROTECTED(bp)); ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); ASSERT0(BP_GET_LEVEL(bp)); if (dscp->dsc_pending_op != PENDING_NONE) { if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dscp->dsc_pending_op = PENDING_NONE; } memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; drror->drr_toguid = dscp->dsc_toguid; if (BP_SHOULD_BYTESWAP(bp)) drror->drr_flags |= DRR_RAW_BYTESWAP; zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); zio_crypt_decode_mac_bp(bp, drror->drr_mac); if (dump_record(dscp, NULL, 0) != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t send_do_embed(const blkptr_t *bp, uint64_t featureflags) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* * If we have not set the ZSTD feature flag, we can't send ZSTD * compressed embedded blocks, as the receiver may not support them. */ if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD && !(featureflags & DMU_BACKUP_FEATURE_ZSTD))) return (B_FALSE); /* * Embed type must be explicitly enabled. 
*/ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } /* * This function actually handles figuring out what kind of record needs to be * dumped, and calling the appropriate helper function. In most cases, * the data has already been read by send_reader_thread(). */ static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range) { int err = 0; switch (range->type) { case OBJECT: err = dump_dnode(dscp, &range->sru.object.bp, range->object, range->sru.object.dnp); return (err); case OBJECT_RANGE: { ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { return (0); } uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >> DNODE_SHIFT; uint64_t firstobj = range->start_blkid * epb; err = dump_object_range(dscp, &range->sru.object_range.bp, firstobj, epb); break; } case REDACT: { struct srr *srrp = &range->sru.redact; err = dump_redact(dscp, range->object, range->start_blkid * srrp->datablksz, (range->end_blkid - range->start_blkid) * srrp->datablksz); return (err); } case DATA: { struct srd *srdp = &range->sru.data; blkptr_t *bp = &srdp->bp; spa_t *spa = dmu_objset_spa(dscp->dsc_os); ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); if (BP_GET_TYPE(bp) == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); zioflags |= ZIO_FLAG_RAW; } zbookmark_phys_t zb; ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); zb.zb_objset = dmu_objset_id(dscp->dsc_os); zb.zb_object = range->object; zb.zb_level = 0; zb.zb_blkid = range->start_blkid; arc_buf_t *abuf = NULL; if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, &zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dscp, bp, zb.zb_object, (abuf == NULL ? NULL : abuf->b_data)); if (abuf != NULL) arc_buf_destroy(abuf, &abuf); return (err); } if (send_do_embed(bp, dscp->dsc_featureflags)) { err = dump_write_embedded(dscp, range->object, range->start_blkid * srdp->datablksz, srdp->datablksz, bp); return (err); } ASSERT(range->object > dscp->dsc_resume_object || (range->object == dscp->dsc_resume_object && range->start_blkid * srdp->datablksz >= dscp->dsc_resume_offset)); /* it's a level-0 block of a regular object */ mutex_enter(&srdp->lock); while (srdp->io_outstanding) cv_wait(&srdp->cv, &srdp->lock); err = srdp->io_err; mutex_exit(&srdp->lock); if (err != 0) { if (zfs_send_corrupt_data && !dscp->dsc_dso->dso_dryrun) { /* * Send a block filled with 0x"zfs badd bloc" */ srdp->abuf = arc_alloc_buf(spa, &srdp->abuf, ARC_BUFC_DATA, srdp->datablksz); uint64_t *ptr; for (ptr = srdp->abuf->b_data; (char *)ptr < (char *)srdp->abuf->b_data + srdp->datablksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } ASSERT(dscp->dsc_dso->dso_dryrun || srdp->abuf != NULL || srdp->abd != NULL); uint64_t offset = range->start_blkid * srdp->datablksz; char *data = NULL; if (srdp->abd != NULL) { data = abd_to_buf(srdp->abd); ASSERT3P(srdp->abuf, ==, NULL); } else if (srdp->abuf != NULL) { data = srdp->abuf->b_data; } /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from * the arc buf into chunks. 
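When a block does get sent as data, the DATA case that follows may still have to split it: if the block is larger than the old maximum block size and the stream was not allowed large blocks, the loop just below emits one write per piece. A standalone sketch of that chunking (the names and the 128K constant are used illustratively):

/* Illustrative only: splitting one large block into 128K-sized pieces. */
#include <stdint.h>
#include <stdio.h>

#define	OLD_MAXBLOCKSIZE	(128 * 1024)

static void
emit_write(uint64_t object, uint64_t offset, uint32_t len)
{
	printf("WRITE obj=%llu off=%llu len=%u\n",
	    (unsigned long long)object, (unsigned long long)offset, len);
}

static void
split_block(uint64_t object, uint64_t offset, uint32_t datablksz)
{
	while (datablksz > 0) {
		uint32_t n = datablksz < OLD_MAXBLOCKSIZE ?
		    datablksz : OLD_MAXBLOCKSIZE;
		emit_write(object, offset, n);
		offset += n;
		datablksz -= n;
	}
}

int
main(void)
{
	/* A 1M block sent to a receiver without large-block support. */
	split_block(42, 0, 1024 * 1024);	/* emits eight 128K writes */
	return (0);
}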
*/ if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && !(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) { while (srdp->datablksz > 0 && err == 0) { int n = MIN(srdp->datablksz, SPA_OLD_MAXBLOCKSIZE); err = dmu_dump_write(dscp, srdp->obj_type, range->object, offset, n, n, NULL, B_FALSE, data); offset += n; /* * When doing dry run, data==NULL is used as a * sentinel value by * dmu_dump_write()->dump_record(). */ if (data != NULL) data += n; srdp->datablksz -= n; } } else { err = dmu_dump_write(dscp, srdp->obj_type, range->object, offset, srdp->datablksz, srdp->datasz, bp, srdp->io_compressed, data); } return (err); } case HOLE: { struct srh *srhp = &range->sru.hole; if (range->object == DMU_META_DNODE_OBJECT) { uint32_t span = srhp->datablksz >> DNODE_SHIFT; uint64_t first_obj = range->start_blkid * span; uint64_t numobj = range->end_blkid * span - first_obj; return (dump_freeobjects(dscp, first_obj, numobj)); } uint64_t offset = 0; /* * If this multiply overflows, we don't need to send this block. * Even if it has a birth time, it can never not be a hole, so * we don't need to send records for it. */ if (!overflow_multiply(range->start_blkid, srhp->datablksz, &offset)) { return (0); } uint64_t len = 0; if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len)) len = UINT64_MAX; len = len - offset; return (dump_free(dscp, range->object, offset, len)); } default: panic("Invalid range type in do_dump: %d", range->type); } return (err); } static struct send_range * range_alloc(enum type type, uint64_t object, uint64_t start_blkid, uint64_t end_blkid, boolean_t eos) { struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP); range->type = type; range->object = object; range->start_blkid = start_blkid; range->end_blkid = end_blkid; range->eos_marker = eos; if (type == DATA) { range->sru.data.abd = NULL; range->sru.data.abuf = NULL; mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL); range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; range->sru.data.io_compressed = B_FALSE; } return (range); } /* * This is the callback function to traverse_dataset that acts as a worker * thread for dmu_send_impl. */ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { (void) zilog; struct send_thread_arg *sta = arg; struct send_range *record; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. 
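The overflow_multiply() checks used in the HOLE case above, and again in send_cb() below, detect 64-bit wraparound by multiplying and then dividing the product back; if the division does not recover the original factor, the product overflowed and the range cannot be expressed as a byte offset. A small demonstration of the same trick (mul_overflows() is an invented stand-in):

/*
 * Illustrative only: detecting 64-bit multiplication overflow by dividing
 * the product back, the same trick overflow_multiply() relies on.
 */
#include <stdint.h>
#include <stdio.h>

static int
mul_overflows(uint64_t a, uint64_t b, uint64_t *c)
{
	uint64_t temp = a * b;	/* unsigned: wraps modulo 2^64 on overflow */

	if (b != 0 && temp / b != a)
		return (1);	/* product did not survive the round trip */
	*c = temp;
	return (0);
}

int
main(void)
{
	uint64_t r;

	/* blkid * datablksz within range: usable as a byte offset. */
	printf("%d\n", mul_overflows(1ULL << 20, 131072, &r));	/* 0 */

	/* A blkid large enough to wrap 64 bits: the range is unsendable. */
	printf("%d\n", mul_overflows(1ULL << 60, 131072, &r));	/* 1 */
	return (0);
}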
*/ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } if (sta->cancel) return (SET_ERROR(EINTR)); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) return (0); atomic_inc_64(sta->num_blocks_visited); if (zb->zb_level == ZB_DNODE_LEVEL) { if (zb->zb_object == DMU_META_DNODE_OBJECT) return (0); record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE); record->sru.object.bp = *bp; size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1); record->sru.object.dnp = kmem_alloc(size, KM_SLEEP); memcpy(record->sru.object.dnp, dnp, size); bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp)) { record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid, zb->zb_blkid + 1, B_FALSE); record->sru.object_range.bp = *bp; bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp))) return (0); if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp)) return (0); uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); uint64_t start; /* * If this multiply overflows, we don't need to send this block. * Even if it has a birth time, it can never not be a hole, so * we don't need to send records for it. */ if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid == DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) && span * zb->zb_blkid > dnp->dn_maxblkid)) { ASSERT(BP_IS_HOLE(bp)); return (0); } if (zb->zb_blkid == DMU_SPILL_BLKID) ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); enum type record_type = DATA; if (BP_IS_HOLE(bp)) record_type = HOLE; else if (BP_IS_REDACTED(bp)) record_type = REDACT; else record_type = DATA; record = range_alloc(record_type, zb->zb_object, start, (start + span < start ? 0 : start + span), B_FALSE); uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ? BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); if (BP_IS_HOLE(bp)) { record->sru.hole.datablksz = datablksz; } else if (BP_IS_REDACTED(bp)) { record->sru.redact.datablksz = datablksz; } else { record->sru.data.datablksz = datablksz; record->sru.data.obj_type = dnp->dn_type; record->sru.data.bp = *bp; } bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } struct redact_list_cb_arg { uint64_t *num_blocks_visited; bqueue_t *q; boolean_t *cancel; boolean_t mark_redact; }; static int redact_list_cb(redact_block_phys_t *rb, void *arg) { struct redact_list_cb_arg *rlcap = arg; atomic_inc_64(rlcap->num_blocks_visited); if (*rlcap->cancel) return (-1); struct send_range *data = range_alloc(REDACT, rb->rbp_object, rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE); ASSERT3U(data->end_blkid, >, rb->rbp_blkid); if (rlcap->mark_redact) { data->type = REDACT; data->sru.redact.datablksz = redact_block_get_size(rb); } else { data->type = PREVIOUSLY_REDACTED; } bqueue_enqueue(rlcap->q, data, sizeof (*data)); return (0); } /* * This function kicks off the traverse_dataset. It also handles setting the * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. 
*/ static __attribute__((noreturn)) void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; int err = 0; struct send_range *data; fstrans_cookie_t cookie = spl_fstrans_mark(); err = traverse_dataset_resume(st_arg->os->os_dsl_dataset, st_arg->fromtxg, &st_arg->resume, st_arg->flags, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; data = range_alloc(DATA, 0, 0, 0, B_TRUE); bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data)); spl_fstrans_unmark(cookie); thread_exit(); } /* * Utility function that causes End of Stream records to compare after of all * others, so that other threads' comparison logic can stay simple. */ static int __attribute__((unused)) send_range_after(const struct send_range *from, const struct send_range *to) { if (from->eos_marker == B_TRUE) return (1); if (to->eos_marker == B_TRUE) return (-1); uint64_t from_obj = from->object; uint64_t from_end_obj = from->object + 1; uint64_t to_obj = to->object; uint64_t to_end_obj = to->object + 1; if (from_obj == 0) { ASSERT(from->type == HOLE || from->type == OBJECT_RANGE); from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT; from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT; } if (to_obj == 0) { ASSERT(to->type == HOLE || to->type == OBJECT_RANGE); to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT; to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT; } if (from_end_obj <= to_obj) return (-1); if (from_obj >= to_end_obj) return (1); int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type == OBJECT_RANGE); if (unlikely(cmp)) return (cmp); cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT); if (unlikely(cmp)) return (cmp); if (from->end_blkid <= to->start_blkid) return (-1); if (from->start_blkid >= to->end_blkid) return (1); return (0); } /* * Pop the new data off the queue, check that the records we receive are in * the right order, but do not free the old data. This is used so that the * records can be sent on to the main thread without copying the data. */ static struct send_range * get_next_range_nofree(bqueue_t *bq, struct send_range *prev) { struct send_range *next = bqueue_dequeue(bq); ASSERT3S(send_range_after(prev, next), ==, -1); return (next); } /* * Pop the new data off the queue, check that the records we receive are in * the right order, and free the old data. */ static struct send_range * get_next_range(bqueue_t *bq, struct send_range *prev) { struct send_range *next = get_next_range_nofree(bq, prev); range_free(prev); return (next); } static __attribute__((noreturn)) void redact_list_thread(void *arg) { struct redact_list_thread_arg *rlt_arg = arg; struct send_range *record; fstrans_cookie_t cookie = spl_fstrans_mark(); if (rlt_arg->rl != NULL) { struct redact_list_cb_arg rlcba = {0}; rlcba.cancel = &rlt_arg->cancel; rlcba.q = &rlt_arg->q; rlcba.num_blocks_visited = rlt_arg->num_blocks_visited; rlcba.mark_redact = rlt_arg->mark_redact; int err = dsl_redaction_list_traverse(rlt_arg->rl, &rlt_arg->resume, redact_list_cb, &rlcba); if (err != EINTR) rlt_arg->error_code = err; } record = range_alloc(DATA, 0, 0, 0, B_TRUE); bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record)); spl_fstrans_unmark(cookie); thread_exit(); } /* * Compare the start point of the two provided ranges. End of stream ranges * compare last, objects compare before any data or hole inside that object and * multi-object holes that start at the same object. 
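The ordering described here, end-of-stream records last and, within an object, OBJECT_RANGE and OBJECT records ahead of data or holes at the same position, is a chain of tie-breaking comparisons. A toy version of such a multi-key comparator in the same -1/0/+1 convention as TREE_CMP (the struct, the local TREE_CMP macro, and toy_range_compare() are invented; send_range_start_compare(), defined just below, is the real thing):

/*
 * Illustrative only: a multi-key comparator with end-of-stream records
 * sorting last and per-object metadata records sorting before data at the
 * same position.
 */
#include <stdint.h>
#include <stdio.h>

#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))

struct toy_range {
	int eos_marker;
	uint64_t object;
	int is_object_record;	/* OBJECT / OBJECT_RANGE style records */
	uint64_t start_blkid;
};

static int
toy_range_compare(const struct toy_range *r1, const struct toy_range *r2)
{
	int cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
	if (cmp != 0)
		return (cmp);		/* eos sorts after everything */
	cmp = TREE_CMP(r1->object, r2->object);
	if (cmp != 0)
		return (cmp);
	/* At the same object, metadata records come first. */
	cmp = TREE_CMP(r2->is_object_record, r1->is_object_record);
	if (cmp != 0)
		return (cmp);
	return (TREE_CMP(r1->start_blkid, r2->start_blkid));
}

int
main(void)
{
	struct toy_range meta = { 0, 7, 1, 0 };
	struct toy_range data = { 0, 7, 0, 0 };
	struct toy_range eos = { 1, 0, 0, 0 };

	printf("%d\n", toy_range_compare(&meta, &data));	/* -1 */
	printf("%d\n", toy_range_compare(&data, &eos));		/* -1 */
	return (0);
}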
*/ static int send_range_start_compare(struct send_range *r1, struct send_range *r2) { uint64_t r1_objequiv = r1->object; uint64_t r1_l0equiv = r1->start_blkid; uint64_t r2_objequiv = r2->object; uint64_t r2_l0equiv = r2->start_blkid; int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker); if (unlikely(cmp)) return (cmp); if (r1->object == 0) { r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK; r1_l0equiv = 0; } if (r2->object == 0) { r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK; r2_l0equiv = 0; } cmp = TREE_CMP(r1_objequiv, r2_objequiv); if (likely(cmp)) return (cmp); cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); if (unlikely(cmp)) return (cmp); cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT); if (unlikely(cmp)) return (cmp); return (TREE_CMP(r1_l0equiv, r2_l0equiv)); } enum q_idx { REDACT_IDX = 0, TO_IDX, FROM_IDX, NUM_THREADS }; /* * This function returns the next range the send_merge_thread should operate on. * The inputs are two arrays; the first one stores the range at the front of the * queues stored in the second one. The ranges are sorted in descending * priority order; the metadata from earlier ranges overrules metadata from * later ranges. out_mask is used to return which threads the ranges came from; * bit i is set if ranges[i] started at the same place as the returned range. * * This code is not hardcoded to compare a specific number of threads; it could * be used with any number, just by changing the q_idx enum. * * The "next range" is the one with the earliest start; if two starts are equal, * the highest-priority range is the next to operate on. If a higher-priority * range starts in the middle of the first range, then the first range will be * truncated to end where the higher-priority range starts, and we will operate * on that one next time. In this way, we make sure that each block covered by * some range gets covered by a returned range, and each block covered is * returned using the metadata of the highest-priority range it appears in. * * For example, if the three ranges at the front of the queues were [2,4), * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata * from the third range, [2,4) with the metadata from the first range, and then * [4,5) with the metadata from the second. */ static struct send_range * find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) { int idx = 0; // index of the range with the earliest start int i; uint64_t bmask = 0; for (i = 1; i < NUM_THREADS; i++) { if (send_range_start_compare(ranges[i], ranges[idx]) < 0) idx = i; } if (ranges[idx]->eos_marker) { struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE); *out_mask = 0; return (ret); } /* * Find all the ranges that start at that same point. */ for (i = 0; i < NUM_THREADS; i++) { if (send_range_start_compare(ranges[i], ranges[idx]) == 0) bmask |= 1 << i; } *out_mask = bmask; /* * OBJECT_RANGE records only come from the TO thread, and should always * be treated as overlapping with nothing and sent on immediately. They * are only used in raw sends, and are never redacted. */ if (ranges[idx]->type == OBJECT_RANGE) { ASSERT3U(idx, ==, TO_IDX); ASSERT3U(*out_mask, ==, 1 << TO_IDX); struct send_range *ret = ranges[idx]; ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); return (ret); } /* * Find the first start or end point after the start of the first range. 
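The boundary being computed here, the first start or end point after the start of the earliest range, is what produces the [1,2), [2,4), [4,5) split in the worked example above: the earliest range is truncated where the next higher-priority range begins. A sketch of just that truncation step over plain half-open intervals (it ignores the same-start bookkeeping the real merge does with out_mask):

/*
 * Illustrative only: given the earliest-starting range and the ranges at
 * the front of the other queues, find where the earliest range must be
 * truncated, i.e. the first start or end point after its own start.
 */
#include <stdint.h>
#include <stdio.h>

struct interval { uint64_t start, end; };	/* half-open [start, end) */

static uint64_t
first_change(const struct interval *earliest, const struct interval *others,
    int n)
{
	uint64_t boundary = earliest->end;

	for (int i = 0; i < n; i++) {
		if (others[i].start > earliest->start &&
		    others[i].start < boundary)
			boundary = others[i].start;
		else if (others[i].end > earliest->start &&
		    others[i].end < boundary)
			boundary = others[i].end;
	}
	return (boundary);
}

int
main(void)
{
	/* The worked example from the comment: [2,4), [3,5) and [1,3). */
	struct interval third = { 1, 3 };
	struct interval others[] = { { 2, 4 }, { 3, 5 } };

	/* [1,3) is truncated to [1,2) because [2,4) starts inside it. */
	printf("[%llu,%llu)\n", (unsigned long long)third.start,
	    (unsigned long long)first_change(&third, others, 2));
	return (0);
}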
*/ uint64_t first_change = ranges[idx]->end_blkid; for (i = 0; i < NUM_THREADS; i++) { if (i == idx || ranges[i]->eos_marker || ranges[i]->object > ranges[idx]->object || ranges[i]->object == DMU_META_DNODE_OBJECT) continue; ASSERT3U(ranges[i]->object, ==, ranges[idx]->object); if (first_change > ranges[i]->start_blkid && (bmask & (1 << i)) == 0) first_change = ranges[i]->start_blkid; else if (first_change > ranges[i]->end_blkid) first_change = ranges[i]->end_blkid; } /* * Update all ranges to no longer overlap with the range we're * returning. All such ranges must start at the same place as the range * being returned, and end at or after first_change. Thus we update * their start to first_change. If that makes them size 0, then free * them and pull a new range from that thread. */ for (i = 0; i < NUM_THREADS; i++) { if (i == idx || (bmask & (1 << i)) == 0) continue; ASSERT3U(first_change, >, ranges[i]->start_blkid); ranges[i]->start_blkid = first_change; ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid); if (ranges[i]->start_blkid == ranges[i]->end_blkid) ranges[i] = get_next_range(qs[i], ranges[i]); } /* * Short-circuit the simple case; if the range doesn't overlap with * anything else, or it only overlaps with things that start at the same * place and are longer, send it on. */ if (first_change == ranges[idx]->end_blkid) { struct send_range *ret = ranges[idx]; ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); return (ret); } /* * Otherwise, return a truncated copy of ranges[idx] and move the start * of ranges[idx] back to first_change. */ struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP); *ret = *ranges[idx]; ret->end_blkid = first_change; ranges[idx]->start_blkid = first_change; return (ret); } #define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX)) /* * Merge the results from the from thread and the to thread, and then hand the * records off to send_prefetch_thread to prefetch them. If this is not a * send from a redaction bookmark, the from thread will push an end of stream * record and stop, and we'll just send everything that was changed in the * to_ds since the ancestor's creation txg. If it is, then since * traverse_dataset has a canonical order, we can compare each change as * they're pulled off the queues. That will give us a stream that is * appropriately sorted, and covers all records. In addition, we pull the * data from the redact_list_thread and use that to determine which blocks * should be redacted. 
*/ static __attribute__((noreturn)) void send_merge_thread(void *arg) { struct send_merge_thread_arg *smt_arg = arg; struct send_range *front_ranges[NUM_THREADS]; bqueue_t *queues[NUM_THREADS]; int err = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); if (smt_arg->redact_arg == NULL) { front_ranges[REDACT_IDX] = kmem_zalloc(sizeof (struct send_range), KM_SLEEP); front_ranges[REDACT_IDX]->eos_marker = B_TRUE; front_ranges[REDACT_IDX]->type = REDACT; queues[REDACT_IDX] = NULL; } else { front_ranges[REDACT_IDX] = bqueue_dequeue(&smt_arg->redact_arg->q); queues[REDACT_IDX] = &smt_arg->redact_arg->q; } front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q); queues[TO_IDX] = &smt_arg->to_arg->q; front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q); queues[FROM_IDX] = &smt_arg->from_arg->q; uint64_t mask = 0; struct send_range *range; for (range = find_next_range(front_ranges, queues, &mask); !range->eos_marker && err == 0 && !smt_arg->cancel; range = find_next_range(front_ranges, queues, &mask)) { /* * If the range in question was in both the from redact bookmark * and the bookmark we're using to redact, then don't send it. * It's already redacted on the receiving system, so a redaction * record would be redundant. */ if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) { ASSERT3U(range->type, ==, REDACT); range_free(range); continue; } bqueue_enqueue(&smt_arg->q, range, sizeof (*range)); if (smt_arg->to_arg->error_code != 0) { err = smt_arg->to_arg->error_code; } else if (smt_arg->from_arg->error_code != 0) { err = smt_arg->from_arg->error_code; } else if (smt_arg->redact_arg != NULL && smt_arg->redact_arg->error_code != 0) { err = smt_arg->redact_arg->error_code; } } if (smt_arg->cancel && err == 0) err = SET_ERROR(EINTR); smt_arg->error = err; if (smt_arg->error != 0) { smt_arg->to_arg->cancel = B_TRUE; smt_arg->from_arg->cancel = B_TRUE; if (smt_arg->redact_arg != NULL) smt_arg->redact_arg->cancel = B_TRUE; } for (int i = 0; i < NUM_THREADS; i++) { while (!front_ranges[i]->eos_marker) { front_ranges[i] = get_next_range(queues[i], front_ranges[i]); } range_free(front_ranges[i]); } range->eos_marker = B_TRUE; bqueue_enqueue_flush(&smt_arg->q, range, 1); spl_fstrans_unmark(cookie); thread_exit(); } struct send_reader_thread_arg { struct send_merge_thread_arg *smta; bqueue_t q; boolean_t cancel; boolean_t issue_reads; uint64_t featureflags; int error; }; static void dmu_send_read_done(zio_t *zio) { struct send_range *range = zio->io_private; mutex_enter(&range->sru.data.lock); if (zio->io_error != 0) { abd_free(range->sru.data.abd); range->sru.data.abd = NULL; range->sru.data.io_err = zio->io_error; } ASSERT(range->sru.data.io_outstanding); range->sru.data.io_outstanding = B_FALSE; cv_broadcast(&range->sru.data.cv); mutex_exit(&range->sru.data.lock); } static void issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) { struct srd *srdp = &range->sru.data; blkptr_t *bp = &srdp->bp; objset_t *os = srta->smta->os; ASSERT3U(range->type, ==, DATA); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); /* * If we have large blocks stored on disk but * the send flags don't allow us to send large * blocks, we split the data from the arc buf * into chunks. 
*/ boolean_t split_large_blocks = srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); /* * We should only request compressed data from the ARC if all * the following are true: * - stream compression was requested * - we aren't splitting large blocks into smaller chunks * - the data won't need to be byteswapped before sending * - this isn't an embedded block * - this isn't metadata (if receiving on a different endian * system it can be byteswapped more easily) */ boolean_t request_compressed = (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) { zioflags |= ZIO_FLAG_RAW; srdp->io_compressed = B_TRUE; } else if (request_compressed) { zioflags |= ZIO_FLAG_RAW_COMPRESS; srdp->io_compressed = B_TRUE; } srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp); if (!srta->issue_reads) return; if (BP_IS_REDACTED(bp)) return; if (send_do_embed(bp, srta->featureflags)) return; zbookmark_phys_t zb = { .zb_objset = dmu_objset_id(os), .zb_object = range->object, .zb_level = 0, .zb_blkid = range->start_blkid, }; arc_flags_t aflags = ARC_FLAG_CACHED_ONLY; int arc_err = arc_read(NULL, os->os_spa, bp, arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, &zb); /* * If the data is not already cached in the ARC, we read directly * from zio. This avoids the performance overhead of adding a new * entry to the ARC, and we also avoid polluting the ARC cache with * data that is not likely to be used in the future. */ if (arc_err != 0) { srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE); srdp->io_outstanding = B_TRUE; zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd, srdp->datasz, dmu_send_read_done, range, ZIO_PRIORITY_ASYNC_READ, zioflags, &zb)); } } /* * Create a new record with the given values. */ static void enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz) { enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE : (BP_IS_REDACTED(bp) ? REDACT : DATA)); struct send_range *range = range_alloc(range_type, dn->dn_object, blkid, blkid + count, B_FALSE); if (blkid == DMU_SPILL_BLKID) { ASSERT3P(bp, !=, NULL); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); } switch (range_type) { case HOLE: range->sru.hole.datablksz = datablksz; break; case DATA: ASSERT3U(count, ==, 1); range->sru.data.datablksz = datablksz; range->sru.data.obj_type = dn->dn_type; range->sru.data.bp = *bp; issue_data_read(srta, range); break; case REDACT: range->sru.redact.datablksz = datablksz; break; default: break; } bqueue_enqueue(q, range, datablksz); } /* * This thread is responsible for two things: First, it retrieves the correct * blkptr in the to ds if we need to send the data because of something from * the from thread. As a result of this, we're the first ones to discover that * some indirect blocks can be discarded because they're not holes. Second, * it issues prefetches for the data we need to send. 
*/ static __attribute__((noreturn)) void send_reader_thread(void *arg) { struct send_reader_thread_arg *srta = arg; struct send_merge_thread_arg *smta = srta->smta; bqueue_t *inq = &smta->q; bqueue_t *outq = &srta->q; objset_t *os = smta->os; fstrans_cookie_t cookie = spl_fstrans_mark(); struct send_range *range = bqueue_dequeue(inq); int err = 0; /* * If the record we're analyzing is from a redaction bookmark from the * fromds, then we need to know whether or not it exists in the tods so * we know whether to create records for it or not. If it does, we need * the datablksz so we can generate an appropriate record for it. * Finally, if it isn't redacted, we need the blkptr so that we can send * a WRITE record containing the actual data. */ uint64_t last_obj = UINT64_MAX; uint64_t last_obj_exists = B_TRUE; while (!range->eos_marker && !srta->cancel && smta->error == 0 && err == 0) { switch (range->type) { case DATA: issue_data_read(srta, range); bqueue_enqueue(outq, range, range->sru.data.datablksz); range = get_next_range_nofree(inq, range); break; case HOLE: case OBJECT: case OBJECT_RANGE: case REDACT: // Redacted blocks must exist bqueue_enqueue(outq, range, sizeof (*range)); range = get_next_range_nofree(inq, range); break; case PREVIOUSLY_REDACTED: { /* * This entry came from the "from bookmark" when * sending from a bookmark that has a redaction * list. We need to check if this object/blkid * exists in the target ("to") dataset, and if * not then we drop this entry. We also need * to fill in the block pointer so that we know * what to prefetch. * * To accomplish the above, we first cache whether or * not the last object we examined exists. If it * doesn't, we can drop this record. If it does, we hold * the dnode and use it to call dbuf_dnode_findbp. We do * this instead of dbuf_bookmark_findbp because we will * often operate on large ranges, and holding the dnode * once is more efficient. */ boolean_t object_exists = B_TRUE; /* * If the data is redacted, we only care if it exists, * so that we don't send records for objects that have * been deleted. */ dnode_t *dn; if (range->object == last_obj && !last_obj_exists) { /* * If we're still examining the same object as * previously, and it doesn't exist, we don't * need to call dbuf_bookmark_findbp. */ object_exists = B_FALSE; } else { err = dnode_hold(os, range->object, FTAG, &dn); if (err == ENOENT) { object_exists = B_FALSE; err = 0; } last_obj = range->object; last_obj_exists = object_exists; } if (err != 0) { break; } else if (!object_exists) { /* * The block was modified, but doesn't * exist in the to dataset; if it was * deleted in the to dataset, then we'll * visit the hole bp for it at some point. */ range = get_next_range(inq, range); continue; } uint64_t file_max = MIN(dn->dn_maxblkid, range->end_blkid); /* * The object exists, so we need to try to find the * blkptr for each block in the range we're processing. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); for (uint64_t blkid = range->start_blkid; blkid < file_max; blkid++) { blkptr_t bp; uint32_t datablksz = dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT; uint64_t offset = blkid * datablksz; /* * This call finds the next non-hole block in * the object. This is to prevent a * performance problem where we're unredacting * a large hole. Using dnode_next_offset to * skip over the large hole avoids iterating * over every block in it. 
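 *
 * (For example: if blkid sits at the start of a hole covering
 * thousands of blocks, dnode_next_offset() jumps straight to the
 * next allocated block and a single HOLE range is enqueued for
 * the whole gap, rather than one record per block.)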
*/ err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &offset, 1, 1, 0); if (err == ESRCH) { offset = UINT64_MAX; err = 0; } else if (err != 0) { break; } if (offset != blkid * datablksz) { /* * if there is a hole from here * (blkid) to offset */ offset = MIN(offset, file_max * datablksz); uint64_t nblks = (offset / datablksz) - blkid; enqueue_range(srta, outq, dn, blkid, nblks, NULL, datablksz); blkid += nblks; } if (blkid >= file_max) break; err = dbuf_dnode_findbp(dn, 0, blkid, &bp, NULL, NULL); if (err != 0) break; ASSERT(!BP_IS_HOLE(&bp)); enqueue_range(srta, outq, dn, blkid, 1, &bp, datablksz); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); range = get_next_range(inq, range); } } } if (srta->cancel || err != 0) { smta->cancel = B_TRUE; srta->error = err; } else if (smta->error != 0) { srta->error = smta->error; } while (!range->eos_marker) range = get_next_range(inq, range); bqueue_enqueue_flush(outq, range, 1); spl_fstrans_unmark(cookie); thread_exit(); } #define NUM_SNAPS_NOT_REDACTED UINT64_MAX struct dmu_send_params { /* Pool args */ const void *tag; // Tag dp was held with, will be used to release dp. dsl_pool_t *dp; /* To snapshot args */ const char *tosnap; dsl_dataset_t *to_ds; /* From snapshot args */ zfs_bookmark_phys_t ancestor_zb; uint64_t *fromredactsnaps; /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */ uint64_t numfromredactsnaps; /* Stream params */ boolean_t is_clone; boolean_t embedok; boolean_t large_block_ok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj; uint64_t resumeoff; uint64_t saved_guid; zfs_bookmark_phys_t *redactbook; /* Stream output params */ dmu_send_outparams_t *dso; /* Stream progress params */ offset_t *off; int outfd; char saved_toname[MAXNAMELEN]; }; static int setup_featureflags(struct dmu_send_params *dspp, objset_t *os, uint64_t *featureflags) { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) return (SET_ERROR(EINVAL)); if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) { *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; } /* encrypted datasets will not have embedded blocks */ if ((dspp->embedok || dspp->rawok) && !os->os_encrypted && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; } /* raw send implies compressok */ if (dspp->compressok || dspp->rawok) *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; if (dspp->rawok && os->os_encrypted) *featureflags |= DMU_BACKUP_FEATURE_RAW; if ((*featureflags & (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { *featureflags |= DMU_BACKUP_FEATURE_LZ4; } /* * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to * allow sending ZSTD compressed datasets to a receiver that does not * support ZSTD */ if ((*featureflags & (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) { *featureflags |= DMU_BACKUP_FEATURE_ZSTD; } if (dspp->resumeobj != 0 || dspp->resumeoff != 0) { *featureflags |= DMU_BACKUP_FEATURE_RESUMING; } if (dspp->redactbook != NULL) { *featureflags |= DMU_BACKUP_FEATURE_REDACTED; 
} if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) { *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; } if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LONGNAME)) { *featureflags |= DMU_BACKUP_FEATURE_LONGNAME; } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_MICROZAP)) { + /* + * We must never split a large microzap block, so we can only + * send large microzaps if LARGE_BLOCKS is already enabled. + */ + if (!(*featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_MICROZAP)); + *featureflags |= DMU_BACKUP_FEATURE_LARGE_MICROZAP; + } + return (0); } static dmu_replay_record_t * create_begin_record(struct dmu_send_params *dspp, objset_t *os, uint64_t featureflags) { dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; struct drr_begin *drrb = &drr->drr_u.drr_begin; dsl_dataset_t *to_ds = dspp->to_ds; drrb->drr_magic = DMU_BACKUP_MAGIC; drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; drrb->drr_type = dmu_objset_type(os); drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid; DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags); if (dspp->is_clone) drrb->drr_flags |= DRR_FLAG_CLONE; if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET) drrb->drr_flags |= DRR_FLAG_CI_DATA; if (zfs_send_set_freerecords_bit) drrb->drr_flags |= DRR_FLAG_FREERECORDS; drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK; if (dspp->savedok) { drrb->drr_toguid = dspp->saved_guid; strlcpy(drrb->drr_toname, dspp->saved_toname, sizeof (drrb->drr_toname)); } else { dsl_dataset_name(to_ds, drrb->drr_toname); if (!to_ds->ds_is_snapshot) { (void) strlcat(drrb->drr_toname, "@--head--", sizeof (drrb->drr_toname)); } } return (drr); } static void setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os, dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok) { VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff, MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_range, ln))); to_arg->error_code = 0; to_arg->cancel = B_FALSE; to_arg->os = to_os; to_arg->fromtxg = fromtxg; to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; if (rawok) to_arg->flags |= TRAVERSE_NO_DECRYPT; if (zfs_send_corrupt_data) to_arg->flags |= TRAVERSE_HARD; to_arg->num_blocks_visited = &dssp->dss_blocks; (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0, curproc, TS_RUN, minclsyspri); } static void setup_from_thread(struct redact_list_thread_arg *from_arg, redaction_list_t *from_rl, dmu_sendstatus_t *dssp) { VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff, MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_range, ln))); from_arg->error_code = 0; from_arg->cancel = B_FALSE; from_arg->rl = from_rl; from_arg->mark_redact = B_FALSE; from_arg->num_blocks_visited = &dssp->dss_blocks; /* * If from_ds is null, send_traverse_thread just returns success and * enqueues an eos marker. 
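 *
 * (Put differently, for an ordinary full or snapshot-to-snapshot
 * incremental send with no redaction bookmark involved, from_rl is
 * NULL, the queue set up here yields only the end-of-stream marker,
 * and send_merge_thread effectively passes the to_ds traversal
 * records through unchanged.)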
*/ (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0, curproc, TS_RUN, minclsyspri); } static void setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg, struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp) { if (dspp->redactbook == NULL) return; rlt_arg->cancel = B_FALSE; VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff, MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_range, ln))); rlt_arg->error_code = 0; rlt_arg->mark_redact = B_TRUE; rlt_arg->rl = rl; rlt_arg->num_blocks_visited = &dssp->dss_blocks; (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0, curproc, TS_RUN, minclsyspri); } static void setup_merge_thread(struct send_merge_thread_arg *smt_arg, struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg, struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg, objset_t *os) { VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff, MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_range, ln))); smt_arg->cancel = B_FALSE; smt_arg->error = 0; smt_arg->from_arg = from_arg; smt_arg->to_arg = to_arg; if (dspp->redactbook != NULL) smt_arg->redact_arg = rlt_arg; smt_arg->os = os; (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc, TS_RUN, minclsyspri); } static void setup_reader_thread(struct send_reader_thread_arg *srt_arg, struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg, uint64_t featureflags) { VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff, MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), offsetof(struct send_range, ln))); srt_arg->smta = smt_arg; srt_arg->issue_reads = !dspp->dso->dso_dryrun; srt_arg->featureflags = featureflags; (void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0, curproc, TS_RUN, minclsyspri); } static int setup_resume_points(struct dmu_send_params *dspp, struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg, struct redact_list_thread_arg *rlt_arg, struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, redaction_list_t *redact_rl, nvlist_t *nvl) { (void) smt_arg; dsl_dataset_t *to_ds = dspp->to_ds; int err = 0; uint64_t obj = 0; uint64_t blkid = 0; if (resuming) { obj = dspp->resumeobj; dmu_object_info_t to_doi; err = dmu_object_info(os, obj, &to_doi); if (err != 0) return (err); blkid = dspp->resumeoff / to_doi.doi_data_block_size; } /* * If we're resuming a redacted send, we can skip to the appropriate * point in the redaction bookmark by binary searching through it. */ if (redact_rl != NULL) { SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid); } SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid); if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) { uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj; /* * Note: If the resume point is in an object whose * blocksize is different in the from vs to snapshots, * we will have divided by the "wrong" blocksize. * However, in this case fromsnap's send_cb() will * detect that the blocksize has changed and therefore * ignore this object. * * If we're resuming a send from a redaction bookmark, * we still cannot accidentally suggest blocks behind * the to_ds. In addition, we know that any blocks in * the object in the to_ds will have to be sent, since * the size changed. Therefore, we can't cause any harm * this way either. 
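 *
 * (Worked example with made-up numbers: resuming at resumeoff = 1M in
 * an object whose data block size is 128K in the to snapshot gives
 * blkid = 8. If that same object used an 8K block size in the from
 * snapshot, blkid 8 is computed against the "wrong" block size for it,
 * but since the block size changed all of the object's blocks have to
 * be sent anyway, so the stale blkid does no harm, as described above.)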
*/ SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid); } if (resuming) { fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj); fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff); } return (0); } static dmu_sendstatus_t * setup_send_progress(struct dmu_send_params *dspp) { dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP); dssp->dss_outfd = dspp->outfd; dssp->dss_off = dspp->off; dssp->dss_proc = curproc; mutex_enter(&dspp->to_ds->ds_sendstream_lock); list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); mutex_exit(&dspp->to_ds->ds_sendstream_lock); return (dssp); } /* * Actually do the bulk of the work in a zfs send. * * The idea is that we want to do a send from ancestor_zb to to_ds. We also * want to not send any data that has been modified by all the datasets in * redactsnaparr, and store the list of blocks that are redacted in this way in * a bookmark named redactbook, created on the to_ds. We do this by creating * several worker threads, whose function is described below. * * There are three cases. * The first case is a redacted zfs send. In this case there are 5 threads. * The first thread is the to_ds traversal thread: it calls dataset_traverse on * the to_ds and finds all the blocks that have changed since ancestor_zb (if * it's a full send, that's all blocks in the dataset). It then sends those * blocks on to the send merge thread. The redact list thread takes the data * from the redaction bookmark and sends those blocks on to the send merge * thread. The send merge thread takes the data from the to_ds traversal * thread, and combines it with the redaction records from the redact list * thread. If a block appears in both the to_ds's data and the redaction data, * the send merge thread will mark it as redacted and send it on to the prefetch * thread. Otherwise, the send merge thread will send the block on to the * prefetch thread unchanged. The prefetch thread will issue prefetch reads for * any data that isn't redacted, and then send the data on to the main thread. * The main thread behaves the same as in a normal send case, issuing demand * reads for data blocks and sending out records over the network * * The graphic below diagrams the flow of data in the case of a redacted zfs * send. Each box represents a thread, and each line represents the flow of * data. * * Records from the | * redaction bookmark | * +--------------------+ | +---------------------------+ * | | v | Send Merge Thread | * | Redact List Thread +----------> Apply redaction marks to | * | | | records as specified by | * +--------------------+ | redaction ranges | * +----^---------------+------+ * | | Merged data * | | * | +------------v--------+ * | | Prefetch Thread | * +--------------------+ | | Issues prefetch | * | to_ds Traversal | | | reads of data blocks| * | Thread (finds +---------------+ +------------+--------+ * | candidate blocks) | Blocks modified | Prefetched data * +--------------------+ by to_ds since | * ancestor_zb +------------v----+ * | Main Thread | File Descriptor * | Sends data over +->(to zfs receive) * | wire | * +-----------------+ * * The second case is an incremental send from a redaction bookmark. The to_ds * traversal thread and the main thread behave the same as in the redacted * send case. The new thread is the from bookmark traversal thread. It * iterates over the redaction list in the redaction bookmark, and enqueues * records for each block that was redacted in the original send. 
The send * merge thread now has to merge the data from the two threads. For details * about that process, see the header comment of send_merge_thread(). Any data * it decides to send on will be prefetched by the prefetch thread. Note that * you can perform a redacted send from a redaction bookmark; in that case, * the data flow behaves very similarly to the flow in the redacted send case, * except with the addition of the bookmark traversal thread iterating over the * redaction bookmark. The send_merge_thread also has to take on the * responsibility of merging the redact list thread's records, the bookmark * traversal thread's records, and the to_ds records. * * +---------------------+ * | | * | Redact List Thread +--------------+ * | | | * +---------------------+ | * Blocks in redaction list | Ranges modified by every secure snap * of from bookmark | (or EOS if not readcted) * | * +---------------------+ | +----v----------------------+ * | bookmark Traversal | v | Send Merge Thread | * | Thread (finds +---------> Merges bookmark, rlt, and | * | candidate blocks) | | to_ds send records | * +---------------------+ +----^---------------+------+ * | | Merged data * | +------------v--------+ * | | Prefetch Thread | * +--------------------+ | | Issues prefetch | * | to_ds Traversal | | | reads of data blocks| * | Thread (finds +---------------+ +------------+--------+ * | candidate blocks) | Blocks modified | Prefetched data * +--------------------+ by to_ds since +------------v----+ * ancestor_zb | Main Thread | File Descriptor * | Sends data over +->(to zfs receive) * | wire | * +-----------------+ * * The final case is a simple zfs full or incremental send. The to_ds traversal * thread behaves the same as always. The redact list thread is never started. * The send merge thread takes all the blocks that the to_ds traversal thread * sends it, prefetches the data, and sends the blocks on to the main thread. * The main thread sends the data over the wire. * * To keep performance acceptable, we want to prefetch the data in the worker * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH * feature built into traverse_dataset, the combining and deletion of records * due to redaction and sends from redaction bookmarks mean that we could * issue many unnecessary prefetches. As a result, we only prefetch data * after we've determined that the record is not going to be redacted. To * prevent the prefetching from getting too far ahead of the main thread, the * blocking queues that are used for communication are capped not by the * number of entries in the queue, but by the sum of the size of the * prefetches associated with them. The limit on the amount of data that the * thread can prefetch beyond what the main thread has reached is controlled * by the global variable zfs_send_queue_length. In addition, to prevent poor * performance in the beginning of a send, we also limit the distance ahead * that the traversal threads can be. That distance is controlled by the * zfs_send_no_prefetch_queue_length tunable. * * Note: Releases dp using the specified tag. 
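 *
 * (Concrete illustration of the capping described above, offered as an
 * approximation rather than a guarantee: once the records sitting in
 * the reader thread's output queue add up to zfs_send_queue_length
 * bytes beyond what the main thread has consumed, bqueue_enqueue()
 * blocks until the main thread catches up; the queues between the
 * traversal, redact list, merge and reader threads are bounded the
 * same way by zfs_send_no_prefetch_queue_length.)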
*/ static int dmu_send_impl(struct dmu_send_params *dspp) { objset_t *os; dmu_replay_record_t *drr; dmu_sendstatus_t *dssp; dmu_send_cookie_t dsc = {0}; int err; uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg; uint64_t featureflags = 0; struct redact_list_thread_arg *from_arg; struct send_thread_arg *to_arg; struct redact_list_thread_arg *rlt_arg; struct send_merge_thread_arg *smt_arg; struct send_reader_thread_arg *srt_arg; struct send_range *range; redaction_list_t *from_rl = NULL; redaction_list_t *redact_rl = NULL; boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0); boolean_t book_resuming = resuming; dsl_dataset_t *to_ds = dspp->to_ds; zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; dsl_pool_t *dp = dspp->dp; const void *tag = dspp->tag; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } /* * If this is a non-raw send of an encrypted ds, we can ensure that * the objset_phys_t is authenticated. This is safe because this is * either a snapshot or we have owned the dataset, ensuring that * it can't be modified. */ if (!dspp->rawok && os->os_encrypted && arc_is_unauthenticated(os->os_phys_buf)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = arc_untransform(os->os_phys_buf, os->os_spa, &zb, B_FALSE); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); } if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) { dsl_pool_rele(dp, tag); return (err); } /* * If we're doing a redacted send, hold the bookmark's redaction list. */ if (dspp->redactbook != NULL) { err = dsl_redaction_list_hold_obj(dp, dspp->redactbook->zbm_redaction_obj, FTAG, &redact_rl); if (err != 0) { dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } dsl_redaction_list_long_hold(dp, redact_rl, FTAG); } /* * If we're sending from a redaction bookmark, hold the redaction list * so that we can consider sending the redacted blocks. */ if (ancestor_zb->zbm_redaction_obj != 0) { err = dsl_redaction_list_hold_obj(dp, ancestor_zb->zbm_redaction_obj, FTAG, &from_rl); if (err != 0) { if (redact_rl != NULL) { dsl_redaction_list_long_rele(redact_rl, FTAG); dsl_redaction_list_rele(redact_rl, FTAG); } dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } dsl_redaction_list_long_hold(dp, from_rl, FTAG); } dsl_dataset_long_hold(to_ds, FTAG); from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP); to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP); rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP); smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP); srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP); drr = create_begin_record(dspp, os, featureflags); dssp = setup_send_progress(dspp); dsc.dsc_drr = drr; dsc.dsc_dso = dspp->dso; dsc.dsc_os = os; dsc.dsc_off = dspp->off; dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsc.dsc_fromtxg = fromtxg; dsc.dsc_pending_op = PENDING_NONE; dsc.dsc_featureflags = featureflags; dsc.dsc_resume_object = dspp->resumeobj; dsc.dsc_resume_offset = dspp->resumeoff; dsl_pool_rele(dp, tag); void *payload = NULL; size_t payload_len = 0; nvlist_t *nvl = fnvlist_alloc(); /* * If we're doing a redacted send, we include the snapshots we're * redacted with respect to so that the target system knows what send * streams can be correctly received on top of this dataset. If we're * instead sending a redacted dataset, we include the snapshots that the * dataset was created with respect to. 
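 *
 * (Illustrative payload shape, not literal output: a send redacted with
 * respect to snapshots A and B attaches an nvlist along the lines of
 * { BEGINNV_REDACT_SNAPS -> [ guid(A), guid(B) ] } to the BEGIN record,
 * which the receiver uses to work out which later streams can be
 * received on top of the redacted dataset.)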
*/ if (dspp->redactbook != NULL) { fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, redact_rl->rl_phys->rlp_snaps, redact_rl->rl_phys->rlp_num_snaps); } else if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_REDACTED_DATASETS)) { uint64_t *tods_guids; uint64_t length; VERIFY(dsl_dataset_get_uint64_array_feature(to_ds, SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids)); fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids, length); } /* * If we're sending from a redaction bookmark, then we should retrieve * the guids of that bookmark so we can send them over the wire. */ if (from_rl != NULL) { fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, from_rl->rl_phys->rlp_snaps, from_rl->rl_phys->rlp_num_snaps); } /* * If the snapshot we're sending from is redacted, include the redaction * list in the stream. */ if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) { ASSERT3P(from_rl, ==, NULL); fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps); if (dspp->numfromredactsnaps > 0) { kmem_free(dspp->fromredactsnaps, dspp->numfromredactsnaps * sizeof (uint64_t)); dspp->fromredactsnaps = NULL; } } if (resuming || book_resuming) { err = setup_resume_points(dspp, to_arg, from_arg, rlt_arg, smt_arg, resuming, os, redact_rl, nvl); if (err != 0) goto out; } if (featureflags & DMU_BACKUP_FEATURE_RAW) { uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid; nvlist_t *keynvl = NULL; ASSERT(os->os_encrypted); err = dsl_crypto_populate_key_nvlist(os, ivset_guid, &keynvl); if (err != 0) { fnvlist_free(nvl); goto out; } fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); fnvlist_free(keynvl); } if (!nvlist_empty(nvl)) { payload = fnvlist_pack(nvl, &payload_len); drr->drr_payloadlen = payload_len; } fnvlist_free(nvl); err = dump_record(&dsc, payload, payload_len); fnvlist_pack_free(payload, payload_len); if (err != 0) { err = dsc.dsc_err; goto out; } setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok); setup_from_thread(from_arg, from_rl, dssp); setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp); setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os); setup_reader_thread(srt_arg, dspp, smt_arg, featureflags); range = bqueue_dequeue(&srt_arg->q); while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); if (issig()) err = SET_ERROR(EINTR); } /* * If we hit an error or are interrupted, cancel our worker threads and * clear the queue of any pending records. The threads will pass the * cancel up the tree of worker threads, and each one will clean up any * pending records before exiting. */ if (err != 0) { srt_arg->cancel = B_TRUE; while (!range->eos_marker) { range = get_next_range(&srt_arg->q, range); } } range_free(range); bqueue_destroy(&srt_arg->q); bqueue_destroy(&smt_arg->q); if (dspp->redactbook != NULL) bqueue_destroy(&rlt_arg->q); bqueue_destroy(&to_arg->q); bqueue_destroy(&from_arg->q); if (err == 0 && srt_arg->error != 0) err = srt_arg->error; if (err != 0) goto out; if (dsc.dsc_pending_op != PENDING_NONE) if (dump_record(&dsc, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsc.dsc_err != 0) err = dsc.dsc_err; goto out; } /* * Send the DRR_END record if this is not a saved stream. * Otherwise, the omitted DRR_END record will signal to * the receive side that the stream is incomplete. 
*/ if (!dspp->savedok) { memset(drr, 0, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; if (dump_record(&dsc, NULL, 0) != 0) err = dsc.dsc_err; } out: mutex_enter(&to_ds->ds_sendstream_lock); list_remove(&to_ds->ds_sendstreams, dssp); mutex_exit(&to_ds->ds_sendstream_lock); VERIFY(err != 0 || (dsc.dsc_sent_begin && (dsc.dsc_sent_end || dspp->savedok))); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dssp, sizeof (dmu_sendstatus_t)); kmem_free(from_arg, sizeof (*from_arg)); kmem_free(to_arg, sizeof (*to_arg)); kmem_free(rlt_arg, sizeof (*rlt_arg)); kmem_free(smt_arg, sizeof (*smt_arg)); kmem_free(srt_arg, sizeof (*srt_arg)); dsl_dataset_long_rele(to_ds, FTAG); if (from_rl != NULL) { dsl_redaction_list_long_rele(from_rl, FTAG); dsl_redaction_list_rele(from_rl, FTAG); } if (redact_rl != NULL) { dsl_redaction_list_long_rele(redact_rl, FTAG); dsl_redaction_list_rele(redact_rl, FTAG); } return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, dmu_send_outparams_t *dsop) { int err; dsl_dataset_t *fromds; ds_hold_flags_t dsflags; struct dmu_send_params dspp = {0}; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; dspp.compressok = compressok; dspp.outfd = outfd; dspp.off = off; dspp.dso = dsop; dspp.tag = FTAG; dspp.rawok = rawok; dspp.savedok = savedok; dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; err = dsl_pool_hold(pool, FTAG, &dspp.dp); if (err != 0) return (err); err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); if (err != 0) { dsl_pool_rele(dspp.dp, FTAG); return (err); } if (fromsnap != 0) { err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags, FTAG, &fromds); if (err != 0) { dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); dsl_pool_rele(dspp.dp, FTAG); return (err); } dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; dspp.ancestor_zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; dspp.ancestor_zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; if (dsl_dataset_is_zapified(fromds)) { (void) zap_lookup(dspp.dp->dp_meta_objset, fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, &dspp.ancestor_zb.zbm_ivset_guid); } /* See dmu_send for the reasons behind this. 
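 * (Namely: the guid array returned by
 * dsl_dataset_get_uint64_array_feature() below is owned by the
 * dataset and is freed when fromds is evicted, so a private copy
 * is taken here.)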
*/ uint64_t *fromredact; if (!dsl_dataset_get_uint64_array_feature(fromds, SPA_FEATURE_REDACTED_DATASETS, &dspp.numfromredactsnaps, &fromredact)) { dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; } else if (dspp.numfromredactsnaps > 0) { uint64_t size = dspp.numfromredactsnaps * sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); memcpy(dspp.fromredactsnaps, fromredact, size); } boolean_t is_before = dsl_dataset_is_before(dspp.to_ds, fromds, 0); dspp.is_clone = (dspp.to_ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); if (!is_before) { dsl_pool_rele(dspp.dp, FTAG); err = SET_ERROR(EXDEV); } else { err = dmu_send_impl(&dspp); } } else { dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dmu_send_impl(&dspp); } if (dspp.fromredactsnaps) kmem_free(dspp.fromredactsnaps, dspp.numfromredactsnaps * sizeof (uint64_t)); dsl_dataset_rele(dspp.to_ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff, const char *redactbook, int outfd, offset_t *off, dmu_send_outparams_t *dsop) { int err = 0; ds_hold_flags_t dsflags; boolean_t owned = B_FALSE; dsl_dataset_t *fromds = NULL; zfs_bookmark_phys_t book = {0}; struct dmu_send_params dspp = {0}; dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; dspp.tosnap = tosnap; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; dspp.compressok = compressok; dspp.outfd = outfd; dspp.off = off; dspp.dso = dsop; dspp.tag = FTAG; dspp.resumeobj = resumeobj; dspp.resumeoff = resumeoff; dspp.rawok = rawok; dspp.savedok = savedok; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = dsl_pool_hold(tosnap, FTAG, &dspp.dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ if (savedok) { /* * We are looking for the dataset that represents the * partially received send stream. If this stream was * received as a new snapshot of an existing dataset, * this will be saved in a hidden clone named * "//%recv". Otherwise, the stream * will be saved in the live dataset itself. In * either case we need to use dsl_dataset_own_force() * because the stream is marked as inconsistent, * which would normally make it unavailable to be * owned. 
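 *
 * (For example, assuming the usual "%recv" clone name: a partially
 * received stream targeting "tank/home" is looked for first at
 * "tank/home/%recv" and, failing that, at "tank/home" itself.)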
*/ char *name = kmem_asprintf("%s/%s", tosnap, recv_clone_name); err = dsl_dataset_own_force(dspp.dp, name, dsflags, FTAG, &dspp.to_ds); if (err == ENOENT) { err = dsl_dataset_own_force(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err == 0) { owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, &dspp.saved_guid); } if (err == 0) { err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TONAME, 1, sizeof (dspp.saved_toname), dspp.saved_toname); } /* Only disown if there was an error in the lookups */ if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); if (err == 0) owned = B_TRUE; } } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } if (redactbook != NULL) { char path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(path, tosnap, sizeof (path)); char *at = strchr(path, '@'); if (at == NULL) { err = EINVAL; } else { (void) snprintf(at, sizeof (path) - (at - path), "#%s", redactbook); err = dsl_bookmark_lookup(dspp.dp, path, NULL, &book); dspp.redactbook = &book; } } if (err != 0) { dsl_pool_rele(dspp.dp, FTAG); if (owned) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); else dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t *zb = &dspp.ancestor_zb; int fsnamelen; if (strpbrk(tosnap, "@#") != NULL) fsnamelen = strpbrk(tosnap, "@#") - tosnap; else fsnamelen = strlen(tosnap); /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. */ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { dspp.is_clone = B_TRUE; } if (strchr(fromsnap, '@') != NULL) { err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG, &fromds); if (err != 0) { ASSERT3P(fromds, ==, NULL); } else { /* * We need to make a deep copy of the redact * snapshots of the from snapshot, because the * array will be freed when we evict from_ds. */ uint64_t *fromredact; if (!dsl_dataset_get_uint64_array_feature( fromds, SPA_FEATURE_REDACTED_DATASETS, &dspp.numfromredactsnaps, &fromredact)) { dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; } else if (dspp.numfromredactsnaps > 0) { uint64_t size = dspp.numfromredactsnaps * sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); memcpy(dspp.fromredactsnaps, fromredact, size); } if (!dsl_dataset_is_before(dspp.to_ds, fromds, 0)) { err = SET_ERROR(EXDEV); } else { zb->zbm_creation_txg = dsl_dataset_phys(fromds)-> ds_creation_txg; zb->zbm_creation_time = dsl_dataset_phys(fromds)-> ds_creation_time; zb->zbm_guid = dsl_dataset_phys(fromds)->ds_guid; zb->zbm_redaction_obj = 0; if (dsl_dataset_is_zapified(fromds)) { (void) zap_lookup( dspp.dp->dp_meta_objset, fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, &zb->zbm_ivset_guid); } } dsl_dataset_rele(fromds, FTAG); } } else { dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds, zb); if (err == EXDEV && zb->zbm_redaction_obj != 0 && zb->zbm_guid == dsl_dataset_phys(dspp.to_ds)->ds_guid) err = 0; } if (err == 0) { /* dmu_send_impl will call dsl_pool_rele for us. 
*/ err = dmu_send_impl(&dspp); } else { if (dspp.fromredactsnaps) kmem_free(dspp.fromredactsnaps, dspp.numfromredactsnaps * sizeof (uint64_t)); dsl_pool_rele(dspp.dp, FTAG); } } else { dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dmu_send_impl(&dspp); } if (owned) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); else dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); return (err); } static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err = 0; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ uint64_t recordsize; uint64_t record_count; objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); /* Assume all (uncompressed) blocks are recordsize. */ if (zfs_override_estimate_recordsize != 0) { recordsize = zfs_override_estimate_recordsize; } else if (os->os_phys->os_type == DMU_OST_ZVOL) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); } else { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); } if (err != 0) return (err); record_count = uncompressed / recordsize; /* * If we're estimating a send size for a compressed stream, use the * compressed data size to estimate the stream size. Otherwise, use the * uncompressed data size. */ size = stream_compressed ? compressed : uncompressed; /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block. */ size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += record_count * sizeof (dmu_replay_record_t); *sizep = size; return (0); } int dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, boolean_t saved, uint64_t *sizep) { int err; dsl_dataset_t *ds = origds; uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool)); ASSERT(fromds == NULL || frombook == NULL); /* * If this is a saved send we may actually be sending * from the %recv clone used for resuming. */ if (saved) { objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset; uint64_t guid; char dsname[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_name(origds, dsname); (void) strcat(dsname, "/"); (void) strlcat(dsname, recv_clone_name, sizeof (dsname)); err = dsl_dataset_hold(origds->ds_dir->dd_pool, dsname, FTAG, &ds); if (err != ENOENT && err != 0) { return (err); } else if (err == ENOENT) { ds = origds; } /* check that this dataset has partially received data */ err = zap_lookup(mos, ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, &guid); if (err != 0) { err = SET_ERROR(err == ENOENT ? EINVAL : err); goto out; } err = zap_lookup(mos, ds->ds_object, DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname); if (err != 0) { err = SET_ERROR(err == ENOENT ? 
EINVAL : err); goto out; } } /* tosnap must be a snapshot or the target of a saved send */ if (!ds->ds_is_snapshot && ds == origds) return (SET_ERROR(EINVAL)); if (fromds != NULL) { uint64_t used; if (!fromds->ds_is_snapshot) { err = SET_ERROR(EINVAL); goto out; } if (!dsl_dataset_is_before(ds, fromds, 0)) { err = SET_ERROR(EXDEV); goto out; } err = dsl_dataset_space_written(fromds, ds, &used, &comp, &uncomp); if (err != 0) goto out; } else if (frombook != NULL) { uint64_t used; err = dsl_dataset_space_written_bookmark(frombook, ds, &used, &comp, &uncomp); if (err != 0) goto out; } else { uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, stream_compressed, sizep); /* * Add the size of the BEGIN and END records to the estimate. */ *sizep += 2 * sizeof (dmu_replay_record_t); out: if (ds != origds) dsl_dataset_rele(ds, FTAG); return (err); } ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, "Allow sending corrupt data"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW, "Maximum send queue length"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW, "Send unmodified spill blocks"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, UINT, ZMOD_RW, "Maximum send queue length for non-prefetch queues"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, UINT, ZMOD_RW, "Send queue fill fraction"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, UINT, ZMOD_RW, "Send queue fill fraction for non-prefetch queues"); ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, UINT, ZMOD_RW, "Override block size estimate with fixed size"); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 2c2a6c7642a5..3fdcebdff918 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1,1577 +1,1577 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, { "dmu_tx_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_error", KSTAT_DATA_UINT64 }, { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, { "dmu_tx_group", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); TXG_VERIFY(dp->dp_spa, txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; if (dn != NULL) { (void) zfs_refcount_add(&dn->dn_holds, tx); if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). */ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) zfs_refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; zfs_refcount_create(&txh->txh_space_towrite); zfs_refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dnode_t *dn = NULL; dmu_tx_hold_t *txh; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { tx->tx_err = err; return (NULL); } } txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); if (dn != NULL) dnode_rele(dn, FTAG); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. 
Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) return (0); if (err != 0) return (err); /* * PARTIAL_FIRST allows caching for uncacheable blocks. It will * be cleared after dmu_buf_will_dirty() call dbuf_read() again. */ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | (level == 0 ? DB_RF_PARTIAL_FIRST : 0)); dbuf_rele(db, FTAG); return (err); } static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. 
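 *
 * (Worked example, assuming a 128K block size and an object with more
 * than one block: a write covering [100K, 300K) reads level-0 blocks
 * 0 and 2, both of which are only partially overwritten, while a
 * write of exactly [128K, 256K) is block-aligned and reads nothing
 * here; interior level-1 blocks are read only when the range spans
 * multiple of them.)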
*/ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the append; first level-0 block (if not aligned, i.e. * if they are partial-block writes), no additional blocks are read. */ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) zfs_refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } /* * Should be used when appending to an object and the exact offset is unknown. * The write must occur at or beyond the specified offset. Only the L0 block * at provided offset will be prefetched. 
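 *
 * Rough usage sketch (illustrative only; the surrounding context and
 * variable names are hypothetical, and most error handling is elided):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_append(tx, object, current_eof, len);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	... perform the write at or beyond current_eof ...
 *	dmu_tx_commit(tx);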
*/ void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_APPEND, off, DMU_OBJECT_END); if (txh != NULL) { dmu_tx_count_append(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); if (txh != NULL) { dmu_tx_count_append(txh, off, len); dmu_tx_count_dnode(txh); } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; ASSERT(tx->tx_txg == 0); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. */ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. 
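 *
 * (For scale, with illustrative geometry: 128K data blocks and 128K
 * indirect blocks give shift = 17 + 17 - 7 = 27, so each level-1
 * block covers 128M of the object and even a multi-gigabyte free
 * range only walks a few dozen level-1 blocks here.)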
*/ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) zfs_refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } } void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } } static void dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { /* * Reuse dmu_tx_count_free(), it does exactly what we need for clone. */ dmu_tx_count_free(txh, off, len); } void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); if (txh != NULL) { dmu_tx_count_dnode(txh); dmu_tx_count_clone(txh, off, len); } } static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; - extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, - zap_micro_max_size, FTAG); + zap_get_micro_max_size(tx->tx_pool->dp_spa), FTAG); if (dn == NULL) return; ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. 
*/ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(dn != NULL); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); if (txh) { (void) zfs_refcount_add_many( &txh->txh_space_towrite, space, FTAG); } } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, (u_longlong_t)beginblk, (u_longlong_t)endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_APPEND: if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; /* * THT_WRITE used for bonus and spill blocks. */ ASSERT(blkid != DMU_BONUS_BLKID && blkid != DMU_SPILL_BLKID); /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. 
*/ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. */ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; case THT_CLONE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; break; default: cmn_err(CE_PANIC, "bad txh_type %d", txh->txh_type); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. 
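/*
 * Illustrative sketch (editorial note, not part of this patch): the curve
 * above evaluated numerically. The tunable values are assumptions chosen to
 * match the charts (500us midpoint for the scale, delay starting at 60%
 * dirty, capped at the 100ms zfs_delay_max_ns); real values come from the
 * zfs_delay_* and zfs_dirty_data_max tunables.
 */
#include <stdint.h>

static uint64_t
example_tx_delay_ns(uint64_t dirty, uint64_t dirty_max)
{
	const uint64_t scale_ns = 500000;	/* assumed zfs_delay_scale */
	const uint64_t min_pct = 60;		/* assumed min dirty percent */
	const uint64_t cap_ns = 100000000;	/* 100ms cap */
	const uint64_t min_bytes = dirty_max * min_pct / 100;

	if (dirty <= min_bytes)
		return (0);
	if (dirty >= dirty_max)
		return (cap_ns);

	/* min_time = scale * (dirty - min) / (max - dirty), then capped */
	uint64_t ns = scale_ns * (dirty - min_bytes) / (dirty_max - dirty);
	return (ns < cap_ns ? ns : cap_ns);
}

/*
 * With these assumptions, 75% dirty yields ~300us of delay and 87.5% about
 * 1.1ms; the delay only nears the 100ms cap very close to the limit, which
 * matches the shape of the plots in this comment.
 */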
* * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes, wrlog; hrtime_t wakeup, tx_time = 0, now; /* Calculate minimum transaction time for the dirty data amount. */ delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; if (dirty > delay_min_bytes) { /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which * could cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); } /* Calculate minimum transaction time for the TX_WRITE log size. */ wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; if (wrlog >= zfs_wrlog_data_max) { tx_time = zfs_delay_max_ns; } else if (wrlog > delay_min_bytes) { tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / (zfs_wrlog_data_max - wrlog), tx_time); } if (tx_time == 0) return; tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); zfs_sleep_until(wakeup); } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. * * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. 
those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). * * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. */ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { spa_t *spa = tx->tx_pool->dp_spa; ASSERT0(tx->tx_txg); if (tx->tx_err) { DMU_TX_STAT_BUMP(dmu_tx_error); return (tx->tx_err); } if (spa_suspended(spa)) { DMU_TX_STAT_BUMP(dmu_tx_suspended); /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && !(txg_how & TXG_WAIT)) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_wrlog_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ uint64_t towrite = 0; uint64_t tohold = 0; for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { /* * This thread can't hold the dn_struct_rwlock * while assigning the tx, because this can lead to * deadlock. Specifically, if this dnode is already * assigned to an earlier txg, this thread may need * to wait for that txg to sync (the ERESTART case * below). The other thread that has assigned this * dnode to an earlier txg prevents this txg from * syncing until its tx can complete (calling * dmu_tx_commit()), but it may need to acquire the * dn_struct_rwlock to do so (e.g. via * dmu_buf_hold*()). * * Note that this thread can't hold the lock for * read either, but the rwlock doesn't record * enough information to make that assertion. 
*/ ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; DMU_TX_STAT_BUMP(dmu_tx_group); return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) zfs_refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += zfs_refcount_count(&txh->txh_space_towrite); tohold += zfs_refcount_count(&txh->txh_memory_tohold); } /* needed allocation: worst-case estimate of write space */ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ uint64_t memory = towrite + tohold; if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); if (err != 0) return (err); } DMU_TX_STAT_BUMP(dmu_tx_assigned); return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh && txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group; txg_how is a bitmask: * * If TXG_WAIT is set and the currently open txg is full, this function * will wait until there's a new txg. This should be used when no locks * are being held. With this bit set, this function will only fail if * we're truly out of space (or over quota). * * If TXG_WAIT is *not* set and we can't assign into the currently open * txg without blocking, this function will return immediately with * ERESTART. This should be used whenever locks are being held. On an * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), * and try again. * * If TXG_NOTHROTTLE is set, this indicates that this tx should not be * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. Of course this is * not strong monotonicity, because the same txg can be returned multiple * times in a row. This guarantee holds both for subsequent calls from * one thread and for multiple threads. For example, it is impossible to * observe the following sequence of events: * * Thread 1 Thread 2 * * dmu_tx_assign(T1, ...) * 1 <- dmu_tx_get_txg(T1) * dmu_tx_assign(T2, ...) * 2 <- dmu_tx_get_txg(T2) * dmu_tx_assign(T3, ...) * 1 <- dmu_tx_get_txg(T3) */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; ASSERT(tx->tx_txg == 0); ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. 
*/ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); if ((txg_how & TXG_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; hrtime_t before; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); before = gethrtime(); if (tx->tx_wait_dirty) { uint64_t dirty; /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); if (dp->dp_dirty_total >= zfs_dirty_data_max) DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_dirty_delayed only has effect if the * caller used TX_WAIT. Otherwise they are going to * destroy this tx and try again. The common case, * zfs_write(), uses TX_WAIT. */ tx->tx_dirty_delayed = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { /* * If we have a lot of dirty data just wait until we sync * out a TXG at which point we'll hopefully have synced * a portion of the changes. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); } static void dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); zfs_refcount_destroy_many(&txh->txh_space_towrite, zfs_refcount_count(&txh->txh_space_towrite)); zfs_refcount_destroy_many(&txh->txh_memory_tohold, zfs_refcount_count(&txh->txh_memory_tohold)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_commit(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. 
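/*
 * Illustrative sketch (editorial note, not part of this patch): the common
 * consumer pattern for dmu_tx_assign() as described above, for a
 * hypothetical objset `os` and ZAP object `obj` (types come from the
 * surrounding ZFS headers). With TXG_WAIT the assignment either succeeds or
 * returns a hard error; callers that hold locks instead pass 0 and, on
 * ERESTART, drop their locks, call dmu_tx_wait(), abort, and retry.
 */
static int
example_tx_update(objset_t *os, uint64_t obj, const char *name, uint64_t val)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* declare what we intend to dirty before assigning */
	dmu_tx_hold_zap(tx, obj, B_TRUE, name);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	err = zap_update(os, obj, name, 8, 1, &val, tx);
	dmu_tx_commit(tx);
	return (err);
}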
*/ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { ASSERT(tx->tx_txg == 0); /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); dmu_tx_destroy(tx); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. 
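/*
 * Illustrative sketch (editorial note, not part of this patch): a commit
 * callback registered with dmu_tx_callback_register() fires exactly once,
 * with error == 0 after the txg holding the change has synced, or with a
 * nonzero error (e.g. ECANCELED from dmu_tx_abort() above) if the change
 * never reaches disk. The callback and its argument here are hypothetical.
 */
static void
example_commit_cb(void *arg, int error)
{
	(void) arg;
	if (error == 0) {
		/* the transaction's changes are on stable storage */
	} else {
		/* aborted or failed; release whatever `arg` tracks */
	}
}

/*
 * Usage, on an already-assigned tx:
 *
 *	dmu_tx_callback_register(tx, example_commit_cb, arg);
 *	... apply the changes ...
 *	dmu_tx_commit(tx);
 */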
*/ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { if (!sa->sa_need_attr_registration) return; for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); if (txh != NULL) (void) zfs_refcount_add_many(&txh->txh_space_towrite, SPA_OLD_MAXBLOCKSIZE, FTAG); } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; DB_DNODE_ENTER(db); dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); DB_DNODE_EXIT(db); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { DB_DNODE_ENTER(db); if (DB_DNODE(db)->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } void dmu_tx_init(void) { dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dmu_tx_ksp != NULL) { dmu_tx_ksp->ks_data = &dmu_tx_stats; kstat_install(dmu_tx_ksp); } } void dmu_tx_fini(void) { if (dmu_tx_ksp != NULL) { kstat_delete(dmu_tx_ksp); dmu_tx_ksp = NULL; } } #if defined(_KERNEL) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_append); EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_bonus); EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode); 
EXPORT_SYMBOL(dmu_tx_abort); EXPORT_SYMBOL(dmu_tx_assign); EXPORT_SYMBOL(dmu_tx_wait); EXPORT_SYMBOL(dmu_tx_commit); EXPORT_SYMBOL(dmu_tx_mark_netfree); EXPORT_SYMBOL(dmu_tx_get_txg); EXPORT_SYMBOL(dmu_tx_callback_register); EXPORT_SYMBOL(dmu_tx_do_callbacks); EXPORT_SYMBOL(dmu_tx_hold_spill); EXPORT_SYMBOL(dmu_tx_hold_sa_create); EXPORT_SYMBOL(dmu_tx_hold_sa); #endif diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index a428a040a4a3..12938022e976 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1,1937 +1,1988 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2024, Klara, Inc. */ #include #include #include #include #include #include #include #include #include #include +#include #ifdef _KERNEL #include #endif -int zap_micro_max_size = MZAP_MAX_BLKSZ; +/* + * The maximum size (in bytes) of a microzap before it is converted to a + * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE). + * + * By definition, a microzap must fit into a single block, so this has + * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default. + * Setting this higher requires both the large_blocks feature (to even create + * blocks that large) and the large_microzap feature (to enable the stream + * machinery to understand not to try to split a microzap block). + * + * If large_microzap is enabled, this value will be clamped to + * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. 
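/*
 * Illustrative sketch (editorial note, not part of this patch): how the
 * clamp described above (implemented by zap_get_micro_max_size() just
 * below) resolves for a few settings. The pool block-size values are
 * assumptions for the example.
 *
 *	zap_micro_max_size	large_microzap	spa_maxblocksize()	result
 *	128K (default)		either		any			128K
 *	1M			disabled	any			128K
 *	1M			enabled		16M			1M
 *	16M			enabled		1M			1M
 */
#include <stdint.h>

static uint64_t
example_micro_max(uint64_t tunable, int feature_enabled, uint64_t spa_maxblock)
{
	uint64_t maxsz = (tunable + 511) & ~511ULL;	/* P2ROUNDUP to 512 */

	if (maxsz <= 128 * 1024)		/* SPA_OLD_MAXBLOCKSIZE */
		return (maxsz);
	if (feature_enabled)
		return (maxsz < spa_maxblock ? maxsz : spa_maxblock);
	return (128 * 1024);
}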
+ */ +static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; + +uint64_t +zap_get_micro_max_size(spa_t *spa) +{ + uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); + if (maxsz <= SPA_OLD_MAXBLOCKSIZE) + return (maxsz); + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) + return (MIN(maxsz, spa_maxblocksize(spa))); + return (SPA_OLD_MAXBLOCKSIZE); +} static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (int i = 0; i < zn->zn_key_norm_numints; wp++, i++) { uint64_t word = *wp; for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * choosing the bucket. */ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, size_t outlen) { ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); size_t inlen = strlen(name) + 1; int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { boolean_t res = B_FALSE; ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { size_t namelen = zn->zn_normbuf_len; char normbuf[ZAP_MAXNAMELEN]; char *norm = normbuf; /* * Cannot allocate this on-stack as it exceed the stack-limit of * 1024. 
*/ if (namelen > ZAP_MAXNAMELEN) norm = kmem_alloc(namelen, KM_SLEEP); if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags, namelen) != 0) { res = B_FALSE; } else { res = (strcmp(zn->zn_key_norm, norm) == 0); } if (norm != normbuf) kmem_free(norm, namelen); } else { res = (strcmp(zn->zn_key_orig, matchname) == 0); } return (res); } static kmem_cache_t *zap_name_cache; static kmem_cache_t *zap_attr_cache; static kmem_cache_t *zap_name_long_cache; static kmem_cache_t *zap_attr_long_cache; void zap_init(void) { zap_name_cache = kmem_cache_create("zap_name", sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_attr_cache = kmem_cache_create("zap_attr_cache", sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_name_long_cache = kmem_cache_create("zap_name_long", sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, NULL, NULL, NULL, 0); } void zap_fini(void) { kmem_cache_destroy(zap_name_cache); kmem_cache_destroy(zap_attr_cache); kmem_cache_destroy(zap_name_long_cache); kmem_cache_destroy(zap_attr_long_cache); } static zap_name_t * zap_name_alloc(zap_t *zap, boolean_t longname) { kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); zn->zn_zap = zap; zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; return (zn); } void zap_name_free(zap_name_t *zn) { if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { kmem_cache_free(zap_name_cache, zn); } else { ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); kmem_cache_free(zap_name_long_cache, zn); } } static int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { zap_t *zap = zn->zn_zap; size_t key_len = strlen(key) + 1; /* Make sure zn is allocated for longname if key is long */ IMPLY(key_len > ZAP_MAXNAMELEN, zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = key_len; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. */ if (zap_normalize(zap, key, zn->zn_normbuf, zap->zap_normflags, zn->zn_normbuf_len) != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) 
*/ if (zap_normalize(zap, key, zn->zn_normbuf, zn->zn_normflags, zn->zn_normbuf_len) != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } return (0); } zap_name_t * zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) { size_t key_len = strlen(key) + 1; zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); if (zap_name_init_str(zn, key, mt) != 0) { zap_name_free(zn); return (NULL); } return (zn); } static zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_normbuf_len = ZAP_MAXNAMELEN; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); int max = (size / MZAP_ENT_LEN) - 1; for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } __attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, mze_compare) static void mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { mzap_ent_t mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mze.mze_chunkid = chunkid; ASSERT0(hash & 0xffffffff); mze.mze_hash = hash >> 32; ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); ASSERT0(zn->zn_hash & 0xffffffff); mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) mze = zfs_btree_next(tree, idx, idx); for (; mze && mze->mze_hash == mze_tofind.mze_hash; mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; zfs_btree_index_t idx; zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT0(hash & 0xffffffff); hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } /* * Each mzap 
entry requires at max : 4 chunks * 3 chunks for names + 1 chunk for value. */ #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) /* * Check if the current entry keeps the colliding entries under the fatzap leaf * size. */ static boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; mzap_ent_t mze_tofind; zfs_btree_index_t idx; zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; ASSERT0(hash & 0xffffffff); hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } /* Include the new entry being added */ mzap_ents++; return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); } static void mze_destroy(zap_t *zap) { zfs_btree_clear(&zap->zap_m.zap_tree); zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = dmu_buf_get_objset(db); zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; /* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; /* * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() * overhead on massive inserts below. It still allows to store * 62 entries before we have to add 2KB B-tree core node. */ zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, mze_find_in_buf, sizeof (mzap_ent_t), 512); zap_name_t *zn = zap_name_alloc(zap, B_FALSE); for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap->zap_m.zap_num_entries++; zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); } } zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. 
*/ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } /* * This routine "consumes" the caller's hold on the dbuf, which must * have the specified tag. */ static int zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; dmu_object_info_t doi; *zapp = NULL; dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > zap_micro_max_size) { + if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; + + if (newsz > SPA_OLD_MAXBLOCKSIZE) { + dsl_dataset_t *ds = dmu_objset_ds(os); + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_MICROZAP)) { + /* + * A microzap just grew beyond the old limit + * for the first time, so we have to ensure the + * feature flag is activated. + * zap_get_micro_max_size() won't let us get + * here if the feature is not enabled, so we + * don't need any other checks beforehand. + * + * Since we're in open context, we can't + * activate the feature directly, so we instead + * flag it on the dataset for next sync. 
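/*
 * Illustrative sketch (editorial note, not part of this patch): the
 * grow-or-upgrade decision above, restated. Each microzap entry occupies
 * one 64-byte chunk (MZAP_ENT_LEN) and the first chunk is the header, so a
 * block of db_size bytes holds db_size/64 - 1 entries: 2047 at the old 128K
 * limit, and 262143 at a 16M block if large_blocks and large_microzap allow
 * it. The 512 below stands in for SPA_MINBLOCKSIZE.
 */
#include <stdint.h>

static int
example_mzap_full_action(uint64_t db_size, uint64_t clamped_micro_max)
{
	uint64_t newsz = db_size + 512;		/* grow by one min-block step */

	if (newsz > clamped_micro_max)
		return (1);	/* convert to fatzap (mzap_upgrade) */

	/*
	 * Otherwise stay micro and grow the block; crossing the old 128K
	 * bound is what flags SPA_FEATURE_LARGE_MICROZAP for activation at
	 * next sync, as in the code above.
	 */
	return (0);
}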
+ */ + dsl_dataset_dirty(ds, tx); + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation + [SPA_FEATURE_LARGE_MICROZAP] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + } } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { dmu_buf_t *db; int err; err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); else VERIFY(dnode_add_ref(dn, tag)); return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { dnode_t *dn; dmu_buf_t *db; int err; err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { dnode_rele(dn, tag); return (err); } err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); dnode_rele(dn, tag); } return (err); } void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); memcpy(mzp, zap->zap_dbuf->db_data, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err != 0) { vmem_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); zap_name_t *zn = zap_name_alloc(zap, B_FALSE); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ } zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); } /* * The "normflags" determine the behavior of the matchtype_t which is * passed to zap_lookup_norm(). Names which have the same normalized * version will be stored with the same hash value, and therefore we can * perform normalization-insensitive lookups. We can be Unicode form- * insensitive and/or case-insensitive. The following flags are valid for * "normflags": * * U8_TEXTPREP_NFC * U8_TEXTPREP_NFD * U8_TEXTPREP_NFKC * U8_TEXTPREP_NFKD * U8_TEXTPREP_TOUPPER * * The *_NF* (Normalization Form) flags are mutually exclusive; at most one * of them may be supplied. */ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; zp->mz_normflags = normflags; if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. 
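/*
 * Illustrative sketch (editorial note, not part of this patch): creating a
 * ZAP whose keys are hashed through a normalization, here case folding, so
 * that later zap_lookup_norm() calls using MT_NORMALIZE can match without
 * regard to case. The object and bonus types are assumptions for the
 * example, and the caller is assumed to have already held the tx for a new
 * object.
 */
static uint64_t
example_create_ci_zap(objset_t *os, dmu_tx_t *tx)
{
	return (zap_create_norm(os, U8_TEXTPREP_TOUPPER,
	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx));
}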
*/ VERIFY(dnode_add_ref(dn, FTAG)); VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); } else { dmu_buf_rele(db, FTAG); } } static uint64_t zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t obj; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); if (allocated_dnode == NULL) { dnode_t *dn; obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, &dn, FTAG, tx); mzap_create_impl(dn, normflags, flags, tx); dnode_rele(dn, FTAG); } else { obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx); mzap_create_impl(*allocated_dnode, normflags, flags, tx); } return (obj); } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, 0, ot, bonustype, bonuslen, dnodesize, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int error; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (error != 0) return (error); error = dnode_hold(os, obj, FTAG, &dn); if (error != 0) return (error); mzap_create_impl(dn, normflags, 0, tx); dnode_rele(dn, FTAG); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, dnodesize, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, 0, ot, 0, 0, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_flags_dnsize(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int 
leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } /* * Create a zap object and return a pointer to the newly allocated dnode via * the allocated_dnode argument. The returned dnode will be held and the * caller is responsible for releasing the hold by calling dnode_rele(). */ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). */ return (dmu_object_free(os, zapobj, tx)); } void zap_evict_sync(void *dbu) { zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap, FTAG); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, zfs_btree_index_t *idx) { boolean_t allocdzn = B_FALSE; mzap_ent_t *other; zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { zn = zap_name_alloc_str(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { zn = zap_name_alloc_str(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. 
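/*
 * Illustrative sketch (editorial note, not part of this patch): typical
 * read-side use of the lookup routines below for a hypothetical objset `os`
 * and ZAP object `zapobj` mapping names to single uint64 values. Whether
 * the object is currently a microzap or a fatzap is transparent to the
 * caller.
 */
static int
example_zap_read(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *valp)
{
	int err;

	/* presence check; "found but value skipped" is mapped to 0 */
	err = zap_contains(os, zapobj, name);
	if (err != 0)
		return (err);

	/* fetch one 8-byte integer */
	return (zap_lookup(os, zapobj, name, 8, 1, valp));
}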
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; if (realname != NULL) (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze, &idx); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_object(objset_t *os, uint64_t zapobj) { int error; dmu_object_info_t doi; error = dmu_object_info(os, zapobj, &doi); if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) error = SET_ERROR(EINVAL); if (error == 0) dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); return (error); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, 
FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; uint16_t start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; (void) strlcpy(mze->mze_name, zn->zn_key_orig, sizeof (mze->mze_name)); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; mze_insert(zap, i, zn->zn_hash); return; } } if (start != 0) { start = 0; goto again; } cmn_err(CE_PANIC, "out of entries!"); } static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { const uint64_t *intval = val; int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN || !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } zap = zn->zn_zap; /* fzap_add() may 
change zap */ } else { zfs_btree_index_t idx; if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } static int zap_add_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; const uint64_t *intval = val; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", (u_longlong_t)zapobj, integer_size, (u_longlong_t)num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { 
mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } static int zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, tag); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_update_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_update_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, 0, tx)); } static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); return (err); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, mt, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, 0, tx); zap_unlockdir(zap, FTAG); return (err); } static int zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap, tag); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); /* zap_remove_uint64_impl() calls zap_unlockdir() 
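on every path, including the ENOTSUP and fzap_remove() failure cases, so the lock must not be dropped again here.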
*/ return (err); } int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } static zap_attribute_t * zap_attribute_alloc_impl(boolean_t longname) { zap_attribute_t *za; za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, KM_SLEEP); za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; return (za); } zap_attribute_t * zap_attribute_alloc(void) { return (zap_attribute_alloc_impl(B_FALSE)); } zap_attribute_t * zap_attribute_long_alloc(void) { return (zap_attribute_alloc_impl(B_TRUE)); } void zap_attribute_free(zap_attribute_t *za) { if (za->za_name_len == ZAP_MAXNAMELEN) { kmem_cache_free(zap_attr_cache, za); } else { ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); kmem_cache_free(zap_attr_long_cache, za); } } /* * Routines for iterating over the attributes. */ static void zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; zc->zc_prefetch = prefetch; } void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); } /* * Initialize a cursor at the beginning of the ZAP object. The entire * ZAP object will be prefetched. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); } /* * Initialize a cursor at the beginning, but request that we not prefetch * the entire ZAP object. */ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. * * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err != 0) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. 
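		 * The shifts below invert zap_cursor_serialize(): the low
		 * zap_hashbits() bits of the serialized cursor carry the hash
		 * prefix (moved back into the top bits of zc_hash), and the
		 * bits above them carry the collision differentiator added
		 * to zc_cd.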
*/ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { zfs_btree_index_t idx; mzap_ent_t mze_tofind; mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, &mze_tofind, &idx); if (mze == NULL) { mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, za->za_name_len); zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); memset(zs, 0, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zap_create); EXPORT_SYMBOL(zap_create_dnsize); EXPORT_SYMBOL(zap_create_norm); EXPORT_SYMBOL(zap_create_norm_dnsize); EXPORT_SYMBOL(zap_create_flags); EXPORT_SYMBOL(zap_create_flags_dnsize); EXPORT_SYMBOL(zap_create_claim); EXPORT_SYMBOL(zap_create_claim_norm); EXPORT_SYMBOL(zap_create_claim_norm_dnsize); EXPORT_SYMBOL(zap_create_hold); EXPORT_SYMBOL(zap_destroy); EXPORT_SYMBOL(zap_lookup); EXPORT_SYMBOL(zap_lookup_by_dnode); EXPORT_SYMBOL(zap_lookup_norm); EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_prefetch_object); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); EXPORT_SYMBOL(zap_join_increment); EXPORT_SYMBOL(zap_add_int); EXPORT_SYMBOL(zap_remove_int); EXPORT_SYMBOL(zap_lookup_int); EXPORT_SYMBOL(zap_increment_int); EXPORT_SYMBOL(zap_add_int_key); EXPORT_SYMBOL(zap_lookup_int_key); EXPORT_SYMBOL(zap_increment); EXPORT_SYMBOL(zap_cursor_init); EXPORT_SYMBOL(zap_cursor_fini); EXPORT_SYMBOL(zap_cursor_retrieve); EXPORT_SYMBOL(zap_cursor_advance); EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, "Maximum micro ZAP size, before converting to a fat 
ZAP, in bytes"); #endif diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh index 4b0618017e38..e10d2936cd3d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh @@ -1,93 +1,93 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2012 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_create/zpool_create.shlib ################################################################################ # # Specifically disabling a feature, all other features should be enabled. # # 1. Loop through all existing features: # a. Create a new pool with '-o feature@XXX=disabled'. # b. Verify that every other feature is 'enabled' or 'active'. # ################################################################################ verify_runnable "global" function cleanup { datasetexists $TESTPOOL && log_must zpool destroy $TESTPOOL } function check_features { typeset feature="${1}" zpool get all ${TESTPOOL} | grep feature@ | while read line; do set -- $(echo "${line}") if [[ "feature@${feature}" == "${2}" ]]; then # Failure passed feature must be disabled. if [[ "${3}" != "disabled" ]]; then return 1; fi else # Failure other features must be enabled or active. if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then return 2; fi fi done # All features enabled or active except the expected one. return 0 } log_onexit cleanup # Several representative features are tested to keep the test time short. # The features 'extensible_dataset' and 'enabled_txg' are intentionally # excluded because other features depend on them. 
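# As an illustration, the first pass of the loop below effectively runs
# (the disk list comes from $DISKS):
#   zpool create -f -o feature@hole_birth=disabled $TESTPOOL $DISKS
# and then checks that feature@hole_birth reports 'disabled' while every
# other feature@ property reports 'enabled' or 'active'.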
set -A features \ "hole_birth" \ - "large_blocks" \ "large_dnode" \ + "longname" \ "userobj_accounting" typeset -i i=0 while (( $i < ${#features[*]} )); do log_assert "'zpool create' creates pools with ${features[i]} disabled" log_must zpool create -f -o "feature@${features[i]}=disabled" \ $TESTPOOL $DISKS log_must check_features "${features[i]}" log_must zpool destroy -f $TESTPOOL (( i = i+1 )) done log_pass "'zpool create -o feature@feature=disabled' disables features" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index b5bc46dce993..e1fe865b1d3b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -1,115 +1,116 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2013, 2014 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. All rights reserved. # # Set the expected properties of zpool typeset -a properties=( "size" "capacity" "altroot" "health" "guid" "load_guid" "version" "bootfs" "delegation" "autoreplace" "cachefile" "checkpoint" "failmode" "listsnapshots" "autoexpand" "dedupratio" "dedup_table_quota" "dedup_table_size" "free" "allocated" "readonly" "comment" "expandsize" "freeing" "fragmentation" "leaked" "multihost" "autotrim" "compatibility" "bcloneused" "bclonesaved" "bcloneratio" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" "feature@multi_vdev_crash_dump" "feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth" "feature@extensible_dataset" "feature@embedded_data" "feature@bookmarks" "feature@filesystem_limits" "feature@large_blocks" "feature@sha512" "feature@skein" "feature@edonr" "feature@device_removal" "feature@obsolete_counts" "feature@zpool_checkpoint" "feature@spacemap_v2" "feature@redaction_bookmarks" "feature@redacted_datasets" "feature@bookmark_written" "feature@log_spacemap" "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" ) if is_linux || is_freebsd; then properties+=( "ashift" "feature@large_dnode" "feature@userobj_accounting" "feature@encryption" "feature@project_quota" "feature@allocation_classes" "feature@resilver_defer" "feature@bookmark_v2" "feature@livelist" "feature@zstd_compress" "feature@zilsaxattr" "feature@head_errlog" "feature@blake3" "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" "feature@fast_dedup" "feature@longname" + "feature@large_microzap" ) fi
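As a hedged aside (not part of the patch above), the sketch below shows one way the string-keyed ZAP calls exercised by this change are typically driven. It assumes the caller already has an open objset, an existing ZAP object, and an assigned transaction that holds that object (e.g. via dmu_tx_hold_zap()); the function name and key are made up for illustration.

#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Illustrative sketch only: store, read back, overwrite and remove a single
 * 64-bit value under a string key. "os", "zapobj" and "tx" are assumed to
 * be supplied by the caller.
 */
static int
zap_example(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	uint64_t val = 42, out = 0;
	int err;

	/* Insert; returns EEXIST if the key is already present. */
	err = zap_add(os, zapobj, "example-key", sizeof (uint64_t), 1,
	    &val, tx);
	if (err != 0)
		return (err);

	/* Read the value back as a single 8-byte integer. */
	err = zap_lookup(os, zapobj, "example-key", sizeof (uint64_t), 1,
	    &out);
	if (err != 0)
		return (err);

	/* Overwrite in place; zap_update() also inserts if the key is absent. */
	val = 43;
	err = zap_update(os, zapobj, "example-key", sizeof (uint64_t), 1,
	    &val, tx);
	if (err != 0)
		return (err);

	/* Drop the entry again. */
	return (zap_remove(os, zapobj, "example-key", tx));
}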