diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 53e1ecae3790..4877df4b114d 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -1,156 +1,157 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2024, Klara, Inc. */ #ifndef _ZFEATURE_COMMON_H #define _ZFEATURE_COMMON_H extern __attribute__((visibility("default"))) #include <sys/fs/zfs.h> #include <sys/inttypes.h> #include <sys/types.h> #ifdef __cplusplus extern "C" { #endif struct zfeature_info; typedef enum spa_feature { SPA_FEATURE_NONE = -1, SPA_FEATURE_ASYNC_DESTROY, SPA_FEATURE_EMPTY_BPOBJ, SPA_FEATURE_LZ4_COMPRESS, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, SPA_FEATURE_SPACEMAP_HISTOGRAM, SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_EMBEDDED_DATA, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_FS_SS_LIMIT, SPA_FEATURE_LARGE_BLOCKS, SPA_FEATURE_LARGE_DNODE, SPA_FEATURE_SHA512, SPA_FEATURE_SKEIN, SPA_FEATURE_EDONR, SPA_FEATURE_USEROBJ_ACCOUNTING, SPA_FEATURE_ENCRYPTION, SPA_FEATURE_PROJECT_QUOTA, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_ALLOCATION_CLASSES, SPA_FEATURE_RESILVER_DEFER, SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_REDACTION_BOOKMARKS, SPA_FEATURE_REDACTED_DATASETS, SPA_FEATURE_BOOKMARK_WRITTEN, SPA_FEATURE_LOG_SPACEMAP, SPA_FEATURE_LIVELIST, SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURE_ZSTD_COMPRESS, SPA_FEATURE_DRAID, SPA_FEATURE_ZILSAXATTR, SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, SPA_FEATURE_DYNAMIC_GANG_HEADER, + SPA_FEATURE_BLOCK_CLONING_ENDIAN, SPA_FEATURES } spa_feature_t; #define SPA_FEATURE_DISABLED (-1ULL) typedef enum zfeature_flags { /* Can open pool readonly even if this feature is not supported. */ ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), /* * Is this feature necessary to load the pool? i.e. do we need this * feature to read the full feature list out of the MOS? */ ZFEATURE_FLAG_MOS = (1 << 1), /* Activate this feature at the same time it is enabled. */ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), /* Each dataset has a field set if it has ever used this feature. */ ZFEATURE_FLAG_PER_DATASET = (1 << 3), /* * This feature isn't enabled by zpool upgrade; it must be explicitly * listed to be enabled. It will also be applied if listed in an * explicitly provided compatibility list.
This flag can be removed * from a given feature once support is sufficiently widespread, or * worries about backwards compatibility are no longer relevant. */ ZFEATURE_FLAG_NO_UPGRADE = (1 << 4) } zfeature_flags_t; typedef enum zfeature_type { ZFEATURE_TYPE_BOOLEAN, ZFEATURE_TYPE_UINT64_ARRAY, ZFEATURE_NUM_TYPES } zfeature_type_t; typedef struct zfeature_info { spa_feature_t fi_feature; const char *fi_uname; /* User-facing feature name */ const char *fi_guid; /* On-disk feature identifier */ const char *fi_desc; /* Feature description */ zfeature_flags_t fi_flags; boolean_t fi_zfs_mod_supported; /* supported by running zfs module */ zfeature_type_t fi_type; /* Only relevant for PER_DATASET features */ /* array of dependencies, terminated by SPA_FEATURE_NONE */ const spa_feature_t *fi_depends; } zfeature_info_t; typedef int (zfeature_func_t)(zfeature_info_t *, void *); #define ZFS_FEATURE_DEBUG _ZFEATURE_COMMON_H zfeature_info_t spa_feature_table[SPA_FEATURES]; _ZFEATURE_COMMON_H boolean_t zfeature_checks_disable; _ZFEATURE_COMMON_H boolean_t zfeature_is_valid_guid(const char *); _ZFEATURE_COMMON_H boolean_t zfeature_is_supported(const char *); _ZFEATURE_COMMON_H int zfeature_lookup_guid(const char *, spa_feature_t *); _ZFEATURE_COMMON_H int zfeature_lookup_name(const char *, spa_feature_t *); _ZFEATURE_COMMON_H boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); _ZFEATURE_COMMON_H void zpool_feature_init(void); #ifdef __cplusplus } #endif #endif /* _ZFEATURE_COMMON_H */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 0c3e8106ca6d..bd2ab6468021 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1,10364 +1,10365 @@ diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 7ec27116440a..66aa100b7149 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -1,1074 +1,1085 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" Copyright (c) 2019, 2023, 2024, Klara, Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley .\" .Dd October 2, 2024 .Dt ZPOOL-FEATURES 7 .Os . .Sh NAME .Nm zpool-features .Nd description of ZFS pool features . .Sh DESCRIPTION ZFS pool on-disk format versions are specified via .Dq features which replace the old on-disk format numbers .Pq the last supported on-disk format number is 28 . To enable a feature on a pool use the .Nm zpool Cm upgrade , or set the .Sy feature Ns @ Ns Ar feature-name property to .Sy enabled .
Please also see the .Sx Compatibility feature sets section for information on how sets of features may be enabled together. .Pp The pool format does not affect file system version compatibility or the ability to send file systems between pools. .Pp Since most features can be enabled independently of each other, the on-disk format of the pool is specified by the set of all features marked as .Sy active on the pool. If the pool was created by another software version this set may include unsupported features. . .Ss Identifying features Every feature has a GUID of the form .Ar com.example : Ns Ar feature-name . The reversed DNS name ensures that the feature's GUID is unique across all ZFS implementations. When unsupported features are encountered on a pool they will be identified by their GUIDs. Refer to the documentation for the ZFS implementation that created the pool for information about those features. .Pp Each supported feature also has a short name. By convention a feature's short name is the portion of its GUID which follows the .Sq \&: .Po i.e. .Ar com.example : Ns Ar feature-name would have the short name .Ar feature-name .Pc , however a feature's short name may differ across ZFS implementations if following the convention would result in name conflicts. . .Ss Feature states Features can be in one of three states: .Bl -tag -width "disabled" .It Sy active This feature's on-disk format changes are in effect on the pool. Support for this feature is required to import the pool in read-write mode. If this feature is not read-only compatible, support is also required to import the pool in read-only mode .Pq see Sx Read-only compatibility . .It Sy enabled An administrator has marked this feature as enabled on the pool, but the feature's on-disk format changes have not been made yet. The pool can still be imported by software that does not support this feature, but changes may be made to the on-disk format at any time which will move the feature to the .Sy active state. Some features may support returning to the .Sy enabled state after becoming .Sy active . See feature-specific documentation for details. .It Sy disabled This feature's on-disk format changes have not been made and will not be made unless an administrator moves the feature to the .Sy enabled state. Features cannot be disabled once they have been enabled. .El .Pp The state of supported features is exposed through pool properties of the form .Sy feature Ns @ Ns Ar short-name . . .Ss Read-only compatibility Some features may make on-disk format changes that do not interfere with other software's ability to read from the pool. These features are referred to as .Dq read-only compatible . If all unsupported features on a pool are read-only compatible, the pool can be imported in read-only mode by setting the .Sy readonly property during import .Po see .Xr zpool-import 8 for details on importing pools .Pc . . .Ss Unsupported features For each unsupported feature enabled on an imported pool, a pool property named .Sy unsupported Ns @ Ns Ar feature-name will indicate why the import was allowed despite the unsupported feature. Possible values for this property are: .Bl -tag -width "readonly" .It Sy inactive The feature is in the .Sy enabled state and therefore the pool's on-disk format is still compatible with software that does not support this feature. .It Sy readonly The feature is read-only compatible and the pool has been imported in read-only mode. .El . 
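The state of supported features can also be read and changed programmatically through libzfs, since each state is exposed as a feature@short-name pool property as described above. A minimal sketch (the pool name tank and the queried feature are assumptions; error handling is trimmed):

```c
/*
 * Sketch: query a feature's state and request enabling it via libzfs.
 * Pool name and feature choice are illustrative assumptions.
 */
#include <stdio.h>
#include <libzfs.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);

	zpool_handle_t *zhp = zpool_open(hdl, "tank");
	if (zhp == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* Feature state is exposed as a feature@<short-name> property. */
	char state[64];
	if (zpool_prop_get_feature(zhp, "feature@block_cloning",
	    state, sizeof (state)) == 0)
		(void) printf("block_cloning: %s\n", state);

	/* Features only move disabled -> enabled (-> active); never back. */
	(void) zpool_set_prop(zhp, "feature@block_cloning", "enabled");

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}
```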
.Ss Feature dependencies Some features depend on other features being enabled in order to function. Enabling a feature will automatically enable any features it depends on. . .Ss Compatibility feature sets It is sometimes necessary for a pool to maintain compatibility with a specific on-disk format by enabling and disabling particular features. The .Sy compatibility feature facilitates this by allowing feature sets to be read from text files. When set to .Sy off .Pq the default , compatibility feature sets are disabled .Pq i.e. all features are enabled ; when set to .Sy legacy , no features are enabled. When set to a comma-separated list of filenames .Po each filename may either be an absolute path, or relative to .Pa /etc/zfs/compatibility.d or .Pa /usr/share/zfs/compatibility.d .Pc , the lists of requested features are read from those files, separated by whitespace and/or commas. Only features present in all files are enabled. .Pp Simple sanity checks are applied to the files: they must be between 1 B and 16 KiB in size, and must end with a newline character. .Pp The requested features are applied when a pool is created using .Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar … and control which features are enabled when using .Nm zpool Cm upgrade . .Nm zpool Cm status will not show a warning about disabled features which are not part of the requested feature set. .Pp The special value .Sy legacy prevents any features from being enabled, either via .Nm zpool Cm upgrade or .Nm zpool Cm set Sy feature Ns @ Ns Ar feature-name Ns = Ns Sy enabled . This setting also prevents pools from being upgraded to newer on-disk versions. This is a safety measure to prevent new features from being accidentally enabled, breaking compatibility. .Pp By convention, compatibility files in .Pa /usr/share/zfs/compatibility.d are provided by the distribution, and include feature sets supported by important versions of popular distributions, and feature sets commonly supported at the start of each year. Compatibility files in .Pa /etc/zfs/compatibility.d , if present, will take precedence over files with the same name in .Pa /usr/share/zfs/compatibility.d . .Pp If an unrecognized feature is found in these files, an error message will be shown. If the unrecognized feature is in a file in .Pa /etc/zfs/compatibility.d , this is treated as an error and processing will stop. If the unrecognized feature is under .Pa /usr/share/zfs/compatibility.d , this is treated as a warning and processing will continue. This difference is to allow distributions to include features which might not be recognized by the currently-installed binaries. .Pp Compatibility files may include comments: any text from .Sq # to the end of the line is ignored. .Pp .Sy Example : .Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 versions from v2.12 onwards. allocation_classes async_destroy block_cloning bookmarks device_rebuild embedded_data empty_bpobj enabled_txg extensible_dataset filesystem_limits hole_birth large_blocks livelist log_spacemap lz4_compress project_quota resilver_defer spacemap_histogram spacemap_v2 userobj_accounting zilsaxattr zpool_checkpoint .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2-2.06 # Features which are supported by GRUB2 versions prior to v2.12. # # GRUB is not able to detect ZFS pool if snapshot of top level boot pool # is created.
This issue is observed with GRUB versions before v2.12 if # extensible_dataset feature is enabled on ZFS boot pool. # # This file lists all read-only compatible features except # extensible_dataset and any other feature that depends on it. # allocation_classes async_destroy block_cloning device_rebuild embedded_data empty_bpobj enabled_txg hole_birth log_spacemap lz4_compress resilver_defer spacemap_histogram spacemap_v2 zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed .Pp See .Xr zpool-create 8 and .Xr zpool-upgrade 8 for more information on how these commands are affected by feature sets. . .de feature .It Sy \\$2 .Bl -tag -compact -width "READ-ONLY COMPATIBLE" .It GUID .Sy \\$1:\\$2 .if !"\\$4"" \{\ .It DEPENDENCIES \fB\\$4\fP\c .if !"\\$5"" , \fB\\$5\fP\c .if !"\\$6"" , \fB\\$6\fP\c .if !"\\$7"" , \fB\\$7\fP\c .if !"\\$8"" , \fB\\$8\fP\c .if !"\\$9"" , \fB\\$9\fP\c .\} .It READ-ONLY COMPATIBLE \\$3 .El .Pp .. . .ds instant-never \ .No This feature becomes Sy active No as soon as it is enabled \ and will never return to being Sy enabled . . .ds remount-upgrade \ .No Each filesystem will be upgraded automatically when remounted, \ or when a new file is created under that filesystem. \ The upgrade can also be triggered on filesystems via \ Nm zfs Cm set Sy version Ns = Ns Sy current Ar fs . \ No The upgrade process runs in the background and may take a while to complete \ for filesystems containing large amounts of files . . .de checksum-spiel When the .Sy \\$1 feature is set to .Sy enabled , the administrator can turn on the .Sy \\$1 checksum on any dataset using .Nm zfs Cm set Sy checksum Ns = Ns Sy \\$1 Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy checksum property has been set to .Sy \\$1 , and will return to being .Sy enabled once all filesystems that have ever had their checksum set to .Sy \\$1 are destroyed. .. . .Sh FEATURES The following features are supported on this system: .Bl -tag -width Ds .feature org.zfsonlinux allocation_classes yes This feature enables support for separate allocation classes. .Pp This feature becomes .Sy active when a dedicated allocation class vdev .Pq dedup or special is created with the .Nm zpool Cm create No or Nm zpool Cm add No commands . With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. Without .Sy async_destroy , the file system is not fully removed until all space has been reclaimed. If the destroy operation is interrupted by a reboot or power outage, the next attempt to open the pool will need to complete the destroy operation synchronously. .Pp When .Sy async_destroy is enabled, the file system's data will be reclaimed by a background process, allowing the destroy operation to complete without traversing the entire file system. The background process is able to resume interrupted destroys after the pool has been opened, eliminating the need to finish interrupted destroys as part of the open operation. The amount of space remaining to be reclaimed by the background process is available through the .Sy freeing property. .Pp This feature is only .Sy active while .Sy freeing is non-zero. . .feature org.openzfs blake3 no extensible_dataset This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. 
BLAKE3 is a secure hash algorithm focused on high performance. .Pp .checksum-spiel blake3 . .feature com.fudosecurity block_cloning yes When this feature is enabled, ZFS will use block cloning for operations like .Fn copy_file_range 2 . Block cloning allows multiple references to a single block to be created. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. Blocks can be cloned across datasets under some conditions (like equal .Nm recordsize , the same master encryption key, etc.). ZFS tries its best to clone across datasets, including encrypted ones. This is limited for various (nontrivial) reasons depending on the OS and/or ZFS internals. .Pp This feature becomes .Sy active when the first block is cloned. When the last cloned block is freed, the feature goes back to the .Sy enabled state. +.feature com.truenas block_cloning_endian yes +This feature corrects a ZAP entry endianness issue in the Block Reference +Table (BRT) used by block cloning. +In the original block cloning implementation, BRT ZAP entries were +mistakenly stored as arrays of eight single-byte integers instead of single +8-byte integers, making the on-disk entries endian-unsafe. +.Pp +This feature becomes .Sy active when the first BRT ZAP is created, which +ensures compatibility with existing pools. +When active, new BRT entries are stored in the correct endian-safe format. +The feature returns to the .Sy enabled state when all BRT ZAPs are destroyed. +. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark command. .Pp This feature is .Sy active while any bookmarks exist in the pool. All bookmarks in the pool can be listed by running .Nm zfs Cm list Fl t Sy bookmark Fl r Ar poolname . . .feature com.datto bookmark_v2 no bookmark extensible_dataset This feature enables the creation and management of larger bookmarks which are needed for other features in ZFS. .Pp This feature becomes .Sy active when a v2 bookmark is created and will be returned to the .Sy enabled state when all v2 bookmarks are destroyed. . .feature com.delphix bookmark_written no bookmark extensible_dataset bookmark_v2 This feature enables additional bookmark accounting fields, enabling the .Sy written Ns # Ns Ar bookmark property .Pq space written since a bookmark and estimates of send stream sizes for incrementals from bookmarks. .Pp This feature becomes .Sy active when a bookmark is created and will be returned to the .Sy enabled state when all bookmarks with these fields are destroyed. . .feature org.openzfs device_rebuild yes This feature enables the ability for the .Nm zpool Cm attach and .Nm zpool Cm replace commands to perform sequential reconstruction .Pq instead of healing reconstruction when resilvering. .Pp Sequential reconstruction resilvers a device in LBA order without immediately verifying the checksums. Once complete, a scrub is started, which then verifies the checksums. This approach allows full redundancy to be restored to the pool in the minimum amount of time. This two-phase approach will take longer than a healing resilver when the time to verify the checksums is included. However, unless there is additional pool damage, no checksum errors should be reported by the scrub. This feature is incompatible with raidz configurations. . This feature becomes .Sy active while a sequential resilver is in progress, and returns to .Sy enabled when the resilver completes. .
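Tying back to the block_cloning entry above: a clone request typically reaches ZFS through copy_file_range(2). A minimal Linux-flavoured sketch (file paths are assumptions; a robust caller would loop on short copies):

```c
/*
 * Sketch: clone one file into another with copy_file_range(2).  On a
 * pool with block_cloning enabled, ZFS may satisfy this via the BRT
 * instead of copying data.  Paths are illustrative assumptions.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>

int
main(void)
{
	int in = open("/tank/src.dat", O_RDONLY);
	int out = open("/tank/dst.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;

	if (in < 0 || out < 0 || fstat(in, &st) != 0)
		return (1);

	/* May clone blocks rather than read and rewrite them. */
	ssize_t done = copy_file_range(in, NULL, out, NULL, st.st_size, 0);
	if (done < 0) {
		perror("copy_file_range");
		return (1);
	}

	(void) close(in);
	(void) close(out);
	return (0);
}
```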
.feature com.delphix device_removal no This feature enables the .Nm zpool Cm remove command to remove top-level vdevs, evacuating them to reduce the total size of the pool. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.openzfs draid no This feature enables use of the .Sy draid vdev type. dRAID is a variant of RAID-Z which provides integrated distributed hot spares that allow faster resilvering while retaining the benefits of RAID-Z. Data, parity, and spare space are organized in redundancy groups and distributed evenly over all of the devices. .Pp This feature becomes .Sy active when creating a pool which uses the .Sy draid vdev type, or when adding a new .Sy draid vdev to an existing pool. . .feature com.klarasystems dynamic_gang_header no This feature enables larger gang headers based on the sector size of the pool. When enabled, gang headers will use the entire space allocated for them, instead of always restricting themselves to 512 bytes. This can reduce the need for nested gang trees in extreme fragmentation scenarios. .Pp This feature becomes active when a gang header is written that is larger than 512 bytes. This feature is not enabled by .Xr zpool-upgrade 8 . Instead, it must be manually enabled, or be part of a compatibility file. . .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite .Po if compression is also enabled, an overwrite of a block whose checksum matches the data being written will be ignored .Pc . In an abundance of caution, Edon-R requires verification when used with dedup: .Nm zfs Cm set Sy dedup Ns = Ns Sy edonr , Ns Sy verify .Po see Xr zfs-set 8 Pc . .Pp Edon-R is a very high-performance hash algorithm that was part of the NIST SHA-3 competition. It provides extremely high hash performance .Pq over 350% faster than SHA-256 , but was not selected because of its unsuitability as a general purpose secure hash algorithm. This implementation utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of highly-compressible blocks. Blocks whose contents can compress to 112 bytes or smaller can take advantage of this feature. .Pp When this feature is enabled, the contents of highly-compressible blocks are stored in the block .Dq pointer itself .Po a misnomer in this case, as it contains the compressed data, rather than a pointer to its location on disk .Pc . Thus the space of the block .Pq one sector, typically 512 B or 4 KiB is saved, and no additional I/O is needed to read and write the data block. . \*[instant-never] . .feature com.delphix empty_bpobj yes This feature increases the performance of creating and using a large number of snapshots of a single filesystem or volume, and also reduces the disk space required. .Pp When there are many snapshots, each snapshot uses many Block Pointer Objects .Pq bpobjs to track blocks associated with that snapshot. However, in common use cases, most of these bpobjs are empty. 
This feature allows us to create each bpobj on-demand, thus eliminating the empty bpobjs. .Pp This feature is .Sy active while there are any filesystems, volumes, or snapshots which were created after enabling this feature. . .feature com.delphix enabled_txg yes Once this feature is enabled, ZFS records the transaction group number in which new features are enabled. This has no user-visible impact, but other features may depend on this feature. .Pp This feature becomes .Sy active as soon as it is enabled and will never return to being .Sy enabled . . .feature com.datto encryption no bookmark_v2 extensible_dataset This feature enables the creation and management of natively encrypted datasets. .Pp This feature becomes .Sy active when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.klarasystems fast_dedup yes This feature allows more advanced deduplication features to be enabled on new dedup tables. .Pp This feature will be .Sy active when the first deduplicated block is written after a new dedup table is created (i.e. after a new pool creation, or new checksum used on a dataset with .Sy dedup enabled). It will be returned to the .Sy enabled state when all deduplicated blocks using it are freed. . .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. .Pp This feature will be .Sy active when the first dependent feature uses it, and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.joyent filesystem_limits yes extensible_dataset This feature enables filesystem and snapshot limits. These limits can be used to control how many filesystems and/or snapshots can be created at the point in the tree on which the limits are set. .Pp This feature is .Sy active once either of the limit properties has been set on a dataset and will never return to being .Sy enabled . . .feature com.delphix head_errlog no This feature enables the upgraded version of errlog, which required an on-disk error log format change. Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . In case of encrypted filesystems with unloaded keys we are unable to check their snapshots or clones for errors and these will not be reported. An "access denied" error will be reported. .Pp \*[instant-never] . .feature com.delphix hole_birth no enabled_txg This feature has/had bugs, the result of which is that, if you do a .Nm zfs Cm send Fl i .Pq or Fl R , No since it uses Fl i from an affected dataset, the receiving party will not see any checksum or other errors, but the resulting destination snapshot will not match the source. Its use by .Nm zfs Cm send Fl i has been disabled by default .Po see .Sy send_holes_without_birth_time in .Xr zfs 4 .Pc . .Pp This feature improves performance of incremental sends .Pq Nm zfs Cm send Fl i and receives for objects with many holes. The most common case of hole-filled objects is zvols. .Pp An incremental send stream from snapshot .Sy A No to snapshot Sy B contains information about every block that changed between .Sy A No and Sy B . 
Blocks which did not change between those snapshots can be identified and omitted from the stream using a piece of metadata called the .Dq block birth time , but birth times are not recorded for holes .Pq blocks filled only with zeroes . Since holes created after .Sy A No cannot be distinguished from holes created before Sy A , information about every hole in the entire filesystem or zvol is included in the send stream. .Pp For workloads where holes are rare this is not a problem. However, when incrementally replicating filesystems or zvols with many holes .Pq for example a zvol formatted with another filesystem a lot of time will be spent sending and receiving unnecessary information about holes that already exist on the receiving side. .Pp Once the .Sy hole_birth feature has been enabled the block birth times of all new holes will be recorded. Incremental sends between snapshots created after this feature is enabled will use this new metadata to avoid sending information about holes that already exist on the receiving side. .Pp \*[instant-never] . .feature org.open-zfs large_blocks no extensible_dataset This feature allows the record size on a dataset to be set larger than 128 KiB. .Pp This feature becomes .Sy active once a dataset contains a file with a block size larger than 128 KiB, and will return to being .Sy enabled once all filesystems that have ever had their recordsize larger than 128 KiB are destroyed. . .feature org.zfsonlinux large_dnode no extensible_dataset This feature allows the size of dnodes in a dataset to be set larger than 512 B. . This feature becomes .Sy active once a dataset contains an object with a dnode larger than 512 B, which occurs as a result of setting the .Sy dnodesize dataset property to a value other than .Sy legacy . The feature will return to being .Sy enabled once all filesystems that have ever contained a dnode larger than 512 B are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. . .feature com.klarasystems large_microzap yes extensible_dataset large_blocks This feature allows "micro" ZAPs to grow larger than 128 KiB without being upgraded to "fat" ZAPs. .Pp This feature becomes .Sy active the first time a micro ZAP grows larger than 128 KiB. It will only be returned to the .Sy enabled state when all datasets that ever had a large micro ZAP are destroyed. .Pp Note that even when this feature is enabled, micro ZAPs cannot grow larger than 128 KiB without also changing the .Sy zap_micro_max_size module parameter. See .Xr zfs 4 . . .feature com.delphix livelist yes extensible_dataset This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. All blocks allocated and freed after a clone is created are tracked by the clone's livelist, which is referenced during the deletion of the clone. The feature is activated when a clone is created and remains .Sy active until all clones have been destroyed. . .feature com.delphix log_spacemap yes com.delphix:spacemap_v2 This feature improves performance for heavily-fragmented pools, especially when workloads are heavy in random writes. It does so by logging all the metaslab changes on a single spacemap every TXG instead of scattering multiple writes to all the metaslab spacemaps. .Pp \*[instant-never] .
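The per-dataset features above (large_blocks, large_dnode) are activated through ordinary dataset properties, which can also be set through libzfs. A minimal sketch (the dataset name tank/data is an assumption):

```c
/*
 * Sketch: set the properties whose non-default values activate the
 * large_blocks and large_dnode features.  Dataset name is an assumption.
 */
#include <libzfs.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);

	zfs_handle_t *zhp = zfs_open(hdl, "tank/data", ZFS_TYPE_FILESYSTEM);
	if (zhp == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* recordsize above 128 KiB requires the large_blocks feature. */
	(void) zfs_prop_set(zhp, "recordsize", "1M");

	/* dnodesize other than "legacy" requires the large_dnode feature. */
	(void) zfs_prop_set(zhp, "dnodesize", "auto");

	zfs_close(zhp);
	libzfs_fini(hdl);
	return (0);
}
```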
.feature org.zfsonlinux longname no extensible_dataset This feature allows creating files and directories with names up to 1023 bytes in length. A new dataset property .Sy longname is also introduced to toggle longname support for each dataset individually. This property can be disabled even if the dataset contains longname files. In that case, new longname files cannot be created, but existing longname files can still be looked up. .Pp This feature becomes .Sy active when a file name longer than 255 bytes is created in a dataset, and returns to being .Sy enabled when all such datasets are destroyed. . .feature org.illumos lz4_compress no .Sy lz4 is a high-performance real-time compression algorithm that features significantly faster compression and decompression as well as a higher compression ratio than the older .Sy lzjb compression. Typically, .Sy lz4 compression is approximately 50% faster on compressible data and 200% faster on incompressible data than .Sy lzjb . It is also approximately 80% faster on decompression, while giving approximately a 10% better compression ratio. .Pp When the .Sy lz4_compress feature is set to .Sy enabled , the administrator can turn on .Sy lz4 compression on any dataset on the pool using the .Xr zfs-set 8 command. All newly written metadata will be compressed with the .Sy lz4 algorithm. .Pp \*[instant-never] . .feature com.joyent multi_vdev_crash_dump no This feature allows a dump device to be configured with a pool comprised of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz configuration. .Pp When the .Sy multi_vdev_crash_dump feature is set to .Sy enabled , the administrator can use .Xr dumpadm 8 to configure a dump device on a pool comprised of multiple vdevs. .Pp Under .Fx and Linux this feature is unused, but registered for compatibility. New pools created on these systems will have the feature .Sy enabled but will never transition to .Sy active , as this functionality is not required for crash dump support. Existing pools where this feature is .Sy active can be imported. . .feature com.delphix obsolete_counts yes device_removal This feature is an enhancement of .Sy device_removal , which will over time reduce the memory used to track removed devices. When indirect blocks are freed or remapped, we note that their part of the indirect mapping is .Dq obsolete – no longer needed. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account the space and object usage information against the project identifier .Pq ID . .Pp The project ID is an object-based attribute. When upgrading an existing filesystem, objects without a project ID will be assigned a zero project ID. When this feature is enabled, newly created objects inherit their parent directories' project ID if the parent's inherit flag is set .Pq via Nm chattr Sy [+-]P No or Nm zfs Cm project Fl s Ns | Ns Fl C . Otherwise, the new object's project ID will be zero. An object's project ID can be changed at any time by the owner .Pq or privileged user via .Nm chattr Fl p Ar prjid or .Nm zfs Cm project Fl p Ar prjid . .Pp This feature will become .Sy active as soon as it is enabled and will never return to being .Sy disabled . \*[remount-upgrade] .
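On Linux, the chattr [+-]P and chattr -p prjid operations mentioned in the project_quota entry above map to the FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR ioctls. A sketch (the path and project ID are assumptions):

```c
/*
 * Sketch: assign a project ID and the inherit flag to a directory, the
 * ioctl-level equivalent of "chattr -p 42 +P".  Path and ID are
 * illustrative assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int
main(void)
{
	int fd = open("/tank/fs/projdir", O_RDONLY | O_DIRECTORY);
	struct fsxattr fsx;

	if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx) != 0)
		return (1);

	fsx.fsx_projid = 42;			/* like chattr -p 42 */
	fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* like chattr +P */

	if (ioctl(fd, FS_IOC_FSSETXATTR, &fsx) != 0)
		perror("FS_IOC_FSSETXATTR");

	(void) close(fd);
	return (0);
}
```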
.feature org.openzfs raidz_expansion no none This feature enables the .Nm zpool Cm attach subcommand to attach a new device to a RAID-Z group, expanding the total amount of usable space in the pool. See .Xr zpool-attach 8 . . .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , which create redaction bookmarks storing the list of blocks redacted by the send that created them. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.delphix redacted_datasets no extensible_dataset This feature enables the receiving of redacted .Nm zfs Cm send streams, which create redacted datasets when received. These datasets are missing some of their blocks, and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receives, see .Xr zfs-send 8 . . .feature com.delphix redaction_list_spill no redaction_bookmarks This feature enables the redaction list created by zfs redact to store many more entries. It becomes .Sy active when a redaction list is created with more than 36 entries, and returns to being .Sy enabled when no long redaction lists remain in the pool. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.datto resilver_defer yes This feature allows ZFS to postpone new resilvers if an existing one is already in progress. Without this feature, any new resilvers will cause the currently running one to be immediately restarted from the beginning. .Pp This feature becomes .Sy active once a resilver has been deferred, and returns to being .Sy enabled when the deferred resilver begins. . .feature org.illumos sha512 no extensible_dataset This feature enables the use of the SHA-512/256 truncated hash algorithm .Pq FIPS 180-4 for checksum and dedup. The native 64-bit arithmetic of SHA-512 provides an approximate 50% performance boost over SHA-256 on 64-bit hardware and is thus a good minimum-change replacement candidate for systems where hash performance is important, but these systems cannot for whatever reason utilize the faster .Sy skein No and Sy edonr algorithms. .Pp .checksum-spiel sha512 . .feature org.illumos skein no extensible_dataset This feature enables the use of the Skein hash algorithm for checksum and dedup. Skein is a high-performance secure hash algorithm that was a finalist in the NIST SHA-3 competition. It provides a very high security margin and high performance on 64-bit hardware .Pq 80% faster than SHA-256 . This implementation also utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel skein . .feature com.delphix spacemap_histogram yes This feature allows ZFS to maintain more information about how free space is organized within the pool. If this feature is .Sy enabled , it will be activated when a new space map object is created, or an existing space map is upgraded to the new format, and never returns to being .Sy enabled . . .feature com.delphix spacemap_v2 yes This feature enables the use of the new space map encoding which consists of two words .Pq instead of one whenever it is advantageous.
The new encoding allows space maps to represent large regions of space more efficiently on-disk while also increasing their maximum addressable offset. .Pp This feature becomes .Sy active once it is .Sy enabled , and never returns to being .Sy enabled . . .feature org.zfsonlinux userobj_accounting yes extensible_dataset This feature allows administrators to account the object usage information by user and group. .Pp \*[instant-never] \*[remount-upgrade] . .feature com.klarasystems vdev_zaps_v2 no This feature creates a ZAP object for the root vdev. .Pp This feature becomes active after the next .Nm zpool Cm import or .Nm zpool Cm reguid . . Properties can be retrieved or set on the root vdev using .Nm zpool Cm get and .Nm zpool Cm set with .Sy root as the vdev name which is an alias for .Sy root-0 . .feature org.openzfs zilsaxattr yes extensible_dataset This feature enables .Sy xattr Ns = Ns Sy sa extended attribute logging in the ZIL. If enabled, extended attribute changes .Pq both Sy xattrdir Ns = Ns Sy dir No and Sy xattr Ns = Ns Sy sa are guaranteed to be durable if either the dataset had .Sy sync Ns = Ns Sy always set at the time the changes were made, or .Xr sync 2 is called on the dataset after the changes were made. .Pp This feature becomes .Sy active when a ZIL is created for at least one dataset and will be returned to the .Sy enabled state when it is destroyed for all datasets that use this feature. . .feature com.delphix zpool_checkpoint yes This feature enables the .Nm zpool Cm checkpoint command that can checkpoint the state of the pool at the time it was issued and later rewind to it or discard it. .Pp This feature becomes .Sy active when the .Nm zpool Cm checkpoint command is used to checkpoint the pool. The feature will only return to being .Sy enabled when the pool is rewound or the checkpoint has been discarded. . .feature org.freebsd zstd_compress no extensible_dataset .Sy zstd is a high-performance compression algorithm that features a combination of high compression ratios and high speed. Compared to .Sy gzip , .Sy zstd offers slightly better compression at much higher speeds. Compared to .Sy lz4 , .Sy zstd offers much better compression while being only modestly slower. Typically, .Sy zstd compression speed ranges from 250 to 500 MB/s per thread and decompression speed is over 1 GB/s per thread. .Pp When the .Sy zstd feature is set to .Sy enabled , the administrator can turn on .Sy zstd compression of any dataset using .Nm zfs Cm set Sy compress Ns = Ns Sy zstd Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy compress property has been set to .Sy zstd , and will return to being .Sy enabled once all filesystems that have ever had their .Sy compress property set to .Sy zstd are destroyed. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 8ac1c7cabd6f..0b37530b0e11 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -1,806 +1,812 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License.
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ #ifndef _KERNEL #include #include #include #include #include #endif #include #include #include #include #include #include #include "zfeature_common.h" /* * Set to disable all feature checks while opening pools, allowing pools with * unsupported features to be opened. Set for testing only. */ boolean_t zfeature_checks_disable = B_FALSE; zfeature_info_t spa_feature_table[SPA_FEATURES]; /* * Valid characters for feature guids. This list is mainly for aesthetic * purposes and could be expanded in the future. There are different allowed * characters in the guids reverse dns portion (before the colon) and its * short name (after the colon). */ static int valid_char(char c, boolean_t after_colon) { return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || (after_colon && c == '_') || (!after_colon && (c == '.' || c == '-'))); } /* * Every feature guid must contain exactly one colon which separates a reverse * dns organization name from the feature's "short" name (e.g. * "com.company:feature_name"). */ boolean_t zfeature_is_valid_guid(const char *name) { int i; boolean_t has_colon = B_FALSE; i = 0; while (name[i] != '\0') { char c = name[i++]; if (c == ':') { if (has_colon) return (B_FALSE); has_colon = B_TRUE; continue; } if (!valid_char(c, has_colon)) return (B_FALSE); } return (has_colon); } boolean_t zfeature_is_supported(const char *guid) { if (zfeature_checks_disable) return (B_TRUE); for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(guid, feature->fi_guid) == 0) return (B_TRUE); } return (B_FALSE); } int zfeature_lookup_guid(const char *guid, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(guid, feature->fi_guid) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } int zfeature_lookup_name(const char *name, spa_feature_t *res) { for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *feature = &spa_feature_table[i]; if (!feature->fi_zfs_mod_supported) continue; if (strcmp(name, feature->fi_uname) == 0) { if (res != NULL) *res = i; return (0); } } return (ENOENT); } boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { zfeature_info_t *feature = &spa_feature_table[fid]; for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { if (feature->fi_depends[i] == check) return (B_TRUE); } return (B_FALSE); } static boolean_t deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature) { for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++) if (deps[i] == feature) return (B_TRUE); return (B_FALSE); } #define STRCMP ((int(*)(const void *, const void *))&strcmp) struct 
zfs_mod_supported_features { void *tree; boolean_t all_features; }; struct zfs_mod_supported_features * zfs_mod_list_supported(const char *scope) { #if defined(__FreeBSD__) || defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) (void) scope; return (NULL); #else struct zfs_mod_supported_features *ret = calloc(1, sizeof (*ret)); if (ret == NULL) return (NULL); DIR *sysfs_dir = NULL; char path[128]; if (snprintf(path, sizeof (path), "%s/%s", ZFS_SYSFS_DIR, scope) < sizeof (path)) sysfs_dir = opendir(path); if (sysfs_dir == NULL && errno == ENOENT) { if (snprintf(path, sizeof (path), "%s/%s", ZFS_SYSFS_ALT_DIR, scope) < sizeof (path)) sysfs_dir = opendir(path); } if (sysfs_dir == NULL) { ret->all_features = errno == ENOENT && (access(ZFS_SYSFS_DIR, F_OK) == 0 || access(ZFS_SYSFS_ALT_DIR, F_OK) == 0); return (ret); } struct dirent *node; while ((node = readdir(sysfs_dir)) != NULL) { if (strcmp(node->d_name, ".") == 0 || strcmp(node->d_name, "..") == 0) continue; char *name = strdup(node->d_name); if (name == NULL) { goto nomem; } if (tsearch(name, &ret->tree, STRCMP) == NULL) { /* * Don't bother checking for duplicate entries: * we're iterating a single directory. */ free(name); goto nomem; } } end: closedir(sysfs_dir); return (ret); nomem: zfs_mod_list_supported_free(ret); ret = NULL; goto end; #endif } void zfs_mod_list_supported_free(struct zfs_mod_supported_features *list) { #if !defined(__FreeBSD__) && !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) if (list) { tdestroy(list->tree, free); free(list); } #else (void) list; #endif } #if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) static boolean_t zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs) { char path[128]; if (snprintf(path, sizeof (path), "%s%s%s%s%s", sysfs, scope == NULL ? "" : "/", scope ?: "", name == NULL ? "" : "/", name ?: "") < sizeof (path)) return (access(path, F_OK) == 0); else return (B_FALSE); } boolean_t zfs_mod_supported(const char *scope, const char *name, const struct zfs_mod_supported_features *sfeatures) { boolean_t supported; if (sfeatures != NULL) return (sfeatures->all_features || tfind(name, &sfeatures->tree, STRCMP)); /* * Check both the primary and alternate sysfs locations to determine * if the required functionality is supported. */ supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) || zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR)); /* * For backwards compatibility with kernel modules that predate * supported feature/property checking. Report the feature/property * as supported if the kernel module is loaded but the requested * scope directory does not exist. */ if (supported == B_FALSE) { if ((access(ZFS_SYSFS_DIR, F_OK) == 0 && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR)) || (access(ZFS_SYSFS_ALT_DIR, F_OK) == 0 && !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR))) { supported = B_TRUE; } } return (supported); } #endif static boolean_t zfs_mod_supported_feature(const char *name, const struct zfs_mod_supported_features *sfeatures) { /* * The zfs module spa_feature_table[], whether in-kernel or in * libzpool, always supports all the features. libzfs needs to * query the running module, via sysfs, to determine which * features are supported. * * The equivalent _can_ be done on FreeBSD by way of the sysctl * tree, but this has not been done yet. Therefore, we return * that all features are supported. 
*/ #if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) (void) name, (void) sfeatures; return (B_TRUE); #else return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name, sfeatures)); #endif } static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, zfeature_flags_t flags, zfeature_type_t type, const spa_feature_t *deps, const struct zfs_mod_supported_features *sfeatures) { zfeature_info_t *feature = &spa_feature_table[fid]; static const spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; ASSERT(name != NULL); ASSERT(desc != NULL); ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || (flags & ZFEATURE_FLAG_MOS) == 0); ASSERT3U(fid, <, SPA_FEATURES); ASSERT(zfeature_is_valid_guid(guid)); if (deps == NULL) deps = nodeps; VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) || (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET))); feature->fi_feature = fid; feature->fi_guid = guid; feature->fi_uname = name; feature->fi_desc = desc; feature->fi_flags = flags; feature->fi_type = type; feature->fi_depends = deps; feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid, sfeatures); } /* * Every feature has a GUID of the form com.example:feature_name. The * reversed DNS name ensures that the feature's GUID is unique across all ZFS * implementations. This allows companies to independently develop and * release features. Examples include org.delphix and org.datto. Previously, * features developed on one implementation have used that implementation's * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs * domain name is recommended for new features which are developed by the * OpenZFS community and its platforms. This domain may optionally be used by * companies developing features for initial release through an OpenZFS * implementation. Use of the org.openzfs domain requires reserving the * feature name in advance with the OpenZFS project. 
*/ void zpool_feature_init(void) { struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_POOL_FEATURES); zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", "Destroy filesystems asynchronously.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", "Snapshots use less space.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", "LZ4 compression algorithm support.", ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", "Crash dumps to multiple vdev pools.", 0, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_ENABLED_TXG, "com.delphix:enabled_txg", "enabled_txg", "Record txg at which a feature is enabled", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_HOLE_BIRTH, "com.delphix:hole_birth", "hole_birth", "Retain hole birth txg for more precise zfs send", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, hole_birth_deps, sfeatures); } zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, "com.delphix:zpool_checkpoint", "zpool_checkpoint", "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_SPACEMAP_V2, "com.delphix:spacemap_v2", "spacemap_v2", "Space maps representing large segments are more efficient.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", 0, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t bookmarks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARKS, "com.delphix:bookmarks", "bookmarks", "\"zfs bookmark\" command", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, bookmarks_deps, sfeatures); } { static const spa_feature_t filesystem_limits_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_FS_SS_LIMIT, "com.joyent:filesystem_limits", "filesystem_limits", "Filesystem and snapshot limits.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, filesystem_limits_deps, sfeatures); } zfeature_register(SPA_FEATURE_EMBEDDED_DATA, "com.delphix:embedded_data", "embedded_data", "Blocks which compress very well use even less space.", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t livelist_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LIVELIST, "com.delphix:livelist", "livelist", "Improved clone deletion performance.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, livelist_deps, sfeatures); } { static const 
spa_feature_t log_spacemap_deps[] = { SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LOG_SPACEMAP, "com.delphix:log_spacemap", "log_spacemap", "Log metaslab changes on a single spacemap and " "flush them periodically.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, log_spacemap_deps, sfeatures); } { static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_BLOCKS, "org.open-zfs:large_blocks", "large_blocks", "Support for blocks larger than 128KB.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_blocks_deps, sfeatures); } { static const spa_feature_t large_dnode_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_DNODE, "org.zfsonlinux:large_dnode", "large_dnode", "Variable on-disk size of dnodes.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, large_dnode_deps, sfeatures); } { static const spa_feature_t sha512_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SHA512, "org.illumos:sha512", "sha512", "SHA-512/256 hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, sha512_deps, sfeatures); } { static const spa_feature_t skein_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_SKEIN, "org.illumos:skein", "skein", "Skein hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, skein_deps, sfeatures); } { static const spa_feature_t edonr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_EDONR, "org.illumos:edonr", "edonr", "Edon-R hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, edonr_deps, sfeatures); } { static const spa_feature_t redact_books_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, "com.delphix:redaction_bookmarks", "redaction_bookmarks", "Support for bookmarks which store redaction lists for zfs " "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, redact_books_deps, sfeatures); } { static const spa_feature_t redact_datasets_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTED_DATASETS, "com.delphix:redacted_datasets", "redacted_datasets", "Support for redacted datasets, produced by receiving " "a redacted zfs send stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, redact_datasets_deps, sfeatures); } { static const spa_feature_t bookmark_written_deps[] = { SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, "com.delphix:bookmark_written", "bookmark_written", "Additional accounting, enabling the written# " "property (space written since a bookmark), " "and estimates of send stream sizes for incrementals from " "bookmarks.", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps, sfeatures); } zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, "com.delphix:device_removal", "device_removal", "Top-level vdevs can be removed, reducing logical pool size.", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t obsolete_counts_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, "com.delphix:obsolete_counts", "obsolete_counts", "Reduce memory used by removed devices when their 
blocks " "are freed or remapped.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, obsolete_counts_deps, sfeatures); } { static const spa_feature_t userobj_accounting_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING, "org.zfsonlinux:userobj_accounting", "userobj_accounting", "User/Group object accounting.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps, sfeatures); } { static const spa_feature_t bookmark_v2_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BOOKMARK_V2, "com.datto:bookmark_v2", "bookmark_v2", "Support for larger bookmarks", 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps, sfeatures); } { static const spa_feature_t encryption_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_BOOKMARK_V2, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ENCRYPTION, "com.datto:encryption", "encryption", "Support for dataset level encryption", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, encryption_deps, sfeatures); } { static const spa_feature_t project_quota_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_PROJECT_QUOTA, "org.zfsonlinux:project_quota", "project_quota", "space/object accounting based on project ID.", ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, project_quota_deps, sfeatures); } zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, "org.zfsonlinux:allocation_classes", "allocation_classes", "Support for separate allocation classes.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_RESILVER_DEFER, "com.datto:resilver_defer", "resilver_defer", "Support for deferring new resilvers when one is already running.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t zstd_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, "org.freebsd:zstd_compress", "zstd_compress", "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps, sfeatures); } zfeature_register(SPA_FEATURE_DRAID, "org.openzfs:draid", "draid", "Support for distributed spare RAID", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t zilsaxattr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_ZILSAXATTR, "org.openzfs:zilsaxattr", "zilsaxattr", "Support for xattr=sa extended attribute logging in ZIL.", ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); } zfeature_register(SPA_FEATURE_HEAD_ERRLOG, "com.delphix:head_errlog", "head_errlog", "Support for per-dataset on-disk error logs.", ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t blake3_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_BLAKE3, "org.openzfs:blake3", "blake3", "BLAKE3 hash algorithm.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, blake3_deps, sfeatures); } zfeature_register(SPA_FEATURE_BLOCK_CLONING, "com.fudosecurity:block_cloning", 
"block_cloning", "Support for block cloning via Block Reference Table.", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_BLOCK_CLONING_ENDIAN, + "com.truenas:block_cloning_endian", "block_cloning_endian", + "Fixes BRT ZAP endianness on new pools.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfeature_register(SPA_FEATURE_AVZ_V2, "com.klarasystems:vdev_zaps_v2", "vdev_zaps_v2", "Support for root vdev ZAP.", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t redact_list_spill_deps[] = { SPA_FEATURE_REDACTION_BOOKMARKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_REDACTION_LIST_SPILL, "com.delphix:redaction_list_spill", "redaction_list_spill", "Support for increased number of redaction_snapshot " "arguments in zfs redact.", 0, ZFEATURE_TYPE_BOOLEAN, redact_list_spill_deps, sfeatures); } zfeature_register(SPA_FEATURE_RAIDZ_EXPANSION, "org.openzfs:raidz_expansion", "raidz_expansion", "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfeature_register(SPA_FEATURE_FAST_DEDUP, "com.klarasystems:fast_dedup", "fast_dedup", "Support for advanced deduplication", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { static const spa_feature_t longname_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LONGNAME, "org.zfsonlinux:longname", "longname", "support filename up to 1024 bytes", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, longname_deps, sfeatures); } { static const spa_feature_t large_microzap_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_LARGE_BLOCKS, SPA_FEATURE_NONE }; zfeature_register(SPA_FEATURE_LARGE_MICROZAP, "com.klarasystems:large_microzap", "large_microzap", "Support for microzaps larger than 128KB.", ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); } zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER, "com.klarasystems:dynamic_gang_header", "dynamic_gang_header", "Support for dynamically sized gang headers", ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); zfs_mod_list_supported_free(sfeatures); } #if defined(_KERNEL) EXPORT_SYMBOL(zfeature_lookup_guid); EXPORT_SYMBOL(zfeature_lookup_name); EXPORT_SYMBOL(zfeature_is_supported); EXPORT_SYMBOL(zfeature_is_valid_guid); EXPORT_SYMBOL(zfeature_depends_on); EXPORT_SYMBOL(zpool_feature_init); EXPORT_SYMBOL(spa_feature_table); #endif diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 27d9ed7ea2b0..40664354aa73 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1,1485 +1,1527 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
 */

#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include

/*
 * Block Cloning design.
 *
 * Block Cloning allows one to manually clone a file (or a subset of its
 * blocks) into another (or the same) file by just creating additional
 * references to the data blocks without copying the data itself. Those
 * references are kept in the Block Reference Tables (BRTs).
 *
 * In many ways this is similar to the existing deduplication, but there are
 * some important differences:
 *
 * - Deduplication is automatic and Block Cloning is not - one has to use
 *   dedicated system calls to clone the given file/blocks.
 * - Deduplication keeps all data blocks in its table, even those referenced
 *   just once. Block Cloning creates an entry in its tables only when there
 *   are at least two references to the given data block. If the block was
 *   never explicitly cloned, or the second to last reference was dropped,
 *   there will be neither space nor performance overhead.
 * - Deduplication needs data to work - one needs to pass real data to the
 *   write(2) syscall so a hash can be calculated. Block Cloning doesn't
 *   require data, just block pointers to the data, so it is extremely fast,
 *   as we pay neither the cost of reading the data nor the cost of writing
 *   the data - we operate exclusively on metadata.
 * - If the D (dedup) bit is not set in the block pointer, it means that
 *   the block is not in the dedup table (DDT) and we won't consult the DDT
 *   when we need to free the block. Block Cloning must be consulted on every
 *   free, because we cannot modify the source BP (eg. by setting something
 *   similar to the D bit), thus we have no hint if the block is in the
 *   Block Reference Table (BRT), so we need to look into the BRT. There is
 *   an optimization in place that allows us to eliminate the majority of
 *   BRT lookups, which is described below in the "Minimizing free penalty"
 *   section.
 * - The BRT entry is much smaller than the DDT entry - for BRT we only store
 *   a 64bit offset and a 64bit reference counter.
 * - Dedup keys are cryptographic hashes, so two blocks that are close to
 *   each other on disk are most likely in totally different parts of the
 *   DDT. The BRT entry keys are offsets into a single top-level VDEV, so
 *   data blocks from one file should have BRT entries close to each other.
 * - Scrub will only do a single pass over a block that is referenced
 *   multiple times in the DDT. Unfortunately this is not currently (if at
 *   all) possible with Block Cloning, and a block referenced multiple times
 *   will be scrubbed multiple times. The new, sorted scrub should be able
 *   to eliminate duplicated reads given enough memory.
 * - Deduplication requires a cryptographically strong hash as a checksum or
 *   additional data verification. Block Cloning works with any checksum
 *   algorithm or even with checksumming disabled.
 *
 * As mentioned above, the BRT entries are much smaller than the DDT entries.
 * To uniquely identify a block we just need its vdev id and offset. We also
 * need to maintain a reference counter. The vdev id will often repeat, as
 * there is a small number of top-level VDEVs and a large number of blocks
 * stored in each VDEV. We take advantage of that to reduce the BRT entry
 * size further by maintaining one BRT for each top-level VDEV, so we can
 * then have only the offset and counter as the BRT entry.
 *
 * Minimizing free penalty.
 *
 * Block Cloning allows creating additional references to any existing block.
 * When we free a block there is no hint in the block pointer whether the
 * block was cloned or not, so on each free we have to check if there is a
 * corresponding entry in the BRT or not. If there is, we need to decrease
 * the reference counter. Doing a BRT lookup on every free can potentially
 * be expensive by requiring additional I/Os if the BRT doesn't fit into
 * memory. This is the main problem with deduplication, so we've learned our
 * lesson and try not to repeat the same mistake here. How do we do that? We
 * divide each top-level VDEV into 16MB regions. For each region we maintain
 * a counter that is a sum of all the BRT entries that have offsets within
 * the region. This creates the entry count array of 16bit numbers for each
 * top-level VDEV. The entry count array is always kept in memory and updated
 * on disk in the same transaction group as the BRT updates to keep
 * everything in sync. We can keep the array in memory because it is very
 * small. With 16MB regions and a 1TB VDEV the array requires only 128kB of
 * memory (we may decide to decrease the region size even further in the
 * future). Now, when we want to free a block, we first consult the array.
 * If the counter for the whole region is zero, there is no need to look for
 * the BRT entry, as there isn't one for sure. If the counter for the region
 * is greater than zero, only then will we do a BRT lookup, and if an entry
 * is found we will decrease the reference counter in the BRT entry and in
 * the entry count array.
 *
 * The entry count array is small, but can potentially be larger for very
 * large VDEVs or smaller regions. In this case we don't want to rewrite the
 * entire array on every change. We therefore divide the array into 32kB
 * blocks and keep a bitmap of dirty blocks within a transaction group. When
 * we sync the transaction group we update only the parts of the entry count
 * array that were modified. Note: Keeping track of the dirty parts of the
 * entry count array is implemented, but updating only parts of the array on
 * disk is not yet implemented - for now we will update the entire array if
 * there was any change.
 *
 * The implementation tries to be economic: if the BRT is not used, or is no
 * longer used, there will be no entries in the MOS and no additional memory
 * used (eg. the entry count array is only allocated if needed).
 *
 * Interaction between Deduplication and Block Cloning.
 *
 * If both functionalities are in use, we could end up with a block that is
 * referenced multiple times in both the DDT and the BRT. When we free one
 * of the references we couldn't tell where it belongs, so we would have to
 * decide which table takes precedence: do we first clear DDT references or
 * BRT references? To avoid this dilemma BRT cooperates with DDT - if a
 * given block is being cloned using BRT and the BP has the D (dedup) bit
 * set, BRT will look up the DDT entry instead and increase the counter
 * there. No BRT entry will be created for a block which has the D (dedup)
 * bit set. BRT may be more efficient for manual deduplication, but if the
 * block is already in the DDT, then creating an additional BRT entry would
 * be less efficient. This clever idea was proposed by Allan Jude.
 *
 * Block Cloning across datasets.
 *
 * Block Cloning is not limited to cloning blocks within the same dataset.
 * It is possible (and very useful) to clone blocks between different
 * datasets. One use case is recovering files from snapshots. By cloning the
 * files into the dataset we need no additional storage. Without Block
 * Cloning we would need additional space for those files.
 * Another interesting use case is moving files between datasets (copying
 * the file content to the new dataset and removing the source file). In
 * that case Block Cloning will only be used briefly, because the BRT
 * entries will be removed when the source is removed.
 * Block Cloning across encrypted datasets is supported as long as both
 * datasets share the same master key (e.g. snapshots and clones).
 *
 * Block Cloning flow through ZFS layers.
 *
 * Note: Block Cloning can be used both for cloning file system blocks and
 * ZVOL blocks. As of this writing no interface is implemented that allows
 * for block cloning within a ZVOL.
 * FreeBSD and Linux provide the copy_file_range(2) system call, and we use
 * it for block cloning.
 *
 * ssize_t
 * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
 *                 size_t len, unsigned int flags);
 *
 * Even though offsets and length represent bytes, they have to be
 * block-aligned or we will return an error so the upper layer can fall back
 * to the generic mechanism that will just copy the data.
 * Using copy_file_range(2) calls the OS-independent zfs_clone_range()
 * function. This function was implemented based on zfs_write(), but instead
 * of writing the given data we first read block pointers using the new
 * dmu_read_l0_bps() function from the source file. Once we have BPs from
 * the source file we call the dmu_brt_clone() function on the destination
 * file. This function allocates BPs for us. We iterate over all source BPs.
 * If the given BP is a hole or an embedded block, we just copy the BP
 * as-is. If it points to real data we place this BP on the BRT pending list
 * using the brt_pending_add() function.
 *
 * We use this pending list to keep track of all BPs that got new references
 * within this transaction group.
 *
 * Some special cases to consider and how we address them:
 * - The block we want to clone may have been created within the same
 *   transaction group that we are trying to clone. Such a block has no BP
 *   allocated yet, so it cannot be cloned immediately. We return EAGAIN.
 * - The block we want to clone may have been modified within the same
 *   transaction group. We return EAGAIN.
 * - A block may be cloned multiple times during one transaction group
 *   (that's why the pending list is actually a tree and not an append-only
 *   list - this way we can figure out faster if this block is cloned for
 *   the first time in this txg or a subsequent time).
 * - A block may be cloned and freed within the same transaction group
 *   (see dbuf_undirty()).
 * - A block may be cloned and within the same transaction group the clone
 *   can be cloned again (see dmu_read_l0_bps()).
 * - A file might have been deleted, but the caller still has a file
 *   descriptor open to this file and clones it.
 *
 * When we free a block we have an additional step in the ZIO pipeline where
 * we call the zio_brt_free() function. We then call brt_entry_decref(),
 * which loads the corresponding BRT entry (if one exists) and decreases the
 * reference counter. If this is not the last reference we will stop the ZIO
 * pipeline here. If this is the last reference or the block is not in the
 * BRT, we continue the pipeline and free the block as usual.
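 *
 * To make the free path above concrete, a minimal sketch of the decision
 * (illustrative only - the real logic lives in zio_brt_free() and
 * brt_entry_decref() below):
 *
 *	if (!brt_maybe_exists(spa, bp))
 *		free the block as usual;
 *	else if (brt_entry_decref(spa, bp))
 *		last (or no) reference - free the block as usual;
 *	else
 *		a clone still references the data - stop the ZIO pipeline;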
 *
 * At the beginning of spa_sync(), when there can be no more block cloning
 * but before issuing frees, we call brt_pending_apply(). This function
 * applies all the new clones to the BRT table - we load BRT entries and
 * update reference counters. To sync new BRT entries to disk we use the
 * brt_sync() function. This function will sync all dirty
 * per-top-level-vdev BRTs, the entry count arrays, etc.
 *
 * Block Cloning and ZIL.
 *
 * Every clone operation is divided into chunks (similar to write) and each
 * chunk is cloned in a separate transaction. The chunk size is determined by
 * how many BPs we can fit into a single ZIL entry.
 * Replaying a clone operation is different from a regular clone operation,
 * as when we log clone operations we cannot use the source object - it may
 * reside on a different dataset - so we log the BPs we want to clone.
 * The ZIL is replayed when we mount the given dataset, not when the pool is
 * imported. Taking this into account, it is possible that the pool is
 * imported without mounting datasets and the source dataset is destroyed
 * before the destination dataset is mounted and its ZIL replayed.
 * To address this situation we leverage the zil_claim() mechanism, where
 * ZFS will parse all the ZILs on pool import. When we come across
 * TX_CLONE_RANGE entries, we will bump the reference counters for their BPs
 * in the BRT. Then on mount and ZIL replay we bump the reference counters
 * once more, while the first references are dropped during ZIL destroy by
 * zil_free_clone_range(). It is possible that after zil_claim() we never
 * mount the destination, so we never replay its ZIL and just destroy it.
 * In this case the only references taken will be dropped by
 * zil_free_clone_range(), since the cloning is never going to take place.
 */

static kmem_cache_t *brt_entry_cache;

/*
 * Enable/disable prefetching of BRT entries that we are going to modify.
 */
static int brt_zap_prefetch = 1;

#ifdef ZFS_DEBUG
#define BRT_DEBUG(...) do { \
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	} \
} while (0)
#else
#define BRT_DEBUG(...)
do { } while (0) #endif static int brt_zap_default_bs = 12; static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; typedef struct brt_stats { kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; kstat_named_t brt_decref_free_data_now; kstat_named_t brt_decref_no_entry; } brt_stats_t; static brt_stats_t brt_stats = { { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, { "decref_free_data_now", KSTAT_DATA_UINT64 }, { "decref_no_entry", KSTAT_DATA_UINT64 } }; struct { wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; wmsum_t brt_decref_free_data_now; wmsum_t brt_decref_no_entry; } brt_sums; #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const void *x1, const void *x2); static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); static void brt_rlock(spa_t *spa) { rw_enter(&spa->spa_brt_lock, RW_READER); } static void brt_wlock(spa_t *spa) { rw_enter(&spa->spa_brt_lock, RW_WRITER); } static void brt_unlock(spa_t *spa) { rw_exit(&spa->spa_brt_lock); } static uint16_t brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) { ASSERT3U(idx, <, brtvd->bv_size); if (unlikely(brtvd->bv_need_byteswap)) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); } } static void brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) { ASSERT3U(idx, <, brtvd->bv_size); if (unlikely(brtvd->bv_need_byteswap)) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; } } static void brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt < UINT16_MAX); brt_vdev_entcount_set(brtvd, idx, entcnt + 1); } static void brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt > 0); brt_vdev_entcount_set(brtvd, idx, entcnt - 1); } #ifdef ZFS_DEBUG static void brt_vdev_dump(brt_vdev_t *brtvd) { uint64_t idx; uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, (u_longlong_t)nblocks, (size_t)BT_SIZEOFMAP(nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); if (entcnt > 0) { zfs_dbgmsg(" 
[%04llu] %hu", (u_longlong_t)idx, entcnt); } } } if (brtvd->bv_entcount_dirty) { char *bitmap; bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); for (idx = 0; idx < nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" dirty: %s", bitmap); kmem_free(bitmap, nblocks + 1); } } #endif static brt_vdev_t * brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) { brt_vdev_t *brtvd = NULL; brt_rlock(spa); if (vdevid < spa->spa_brt_nvdevs) { brtvd = spa->spa_brt_vdevs[vdevid]; } else if (alloc) { /* New VDEV was added. */ brt_unlock(spa); brt_wlock(spa); if (vdevid >= spa->spa_brt_nvdevs) brt_vdevs_expand(spa, vdevid + 1); brtvd = spa->spa_brt_vdevs[vdevid]; } brt_unlock(spa); return (brtvd); } static void brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; ASSERT(brtvd->bv_initiated); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); /* * We allocate DMU buffer to store the bv_entcount[] array. * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. */ brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); BRT_DEBUG("MOS BRT VDEV created, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); + /* + * Activate the endian-fixed feature if this is the first BRT ZAP + * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) && + !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); + } else if (spa_feature_is_active(spa, + SPA_FEATURE_BLOCK_CLONING_ENDIAN)) { + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); + } + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; uint64_t nblocks, onblocks, size; ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, brtvd->bv_vdevid); size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; spa_config_exit(spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); ASSERT0P(brtvd->bv_entcount); ASSERT0P(brtvd->bv_bitmap); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); /* * TODO: Allow vdev shrinking. 
We only need to implement * shrinking the on-disk BRT VDEV object. * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, * offset, size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(onblocks))); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; BRT_DEBUG("BRT VDEV %llu initiated.", (u_longlong_t)brtvd->bv_vdevid); } } static int brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; ASSERT(!brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, FTAG, &db); if (error != 0) return (error); bvphys = db->db_data; if (spa->spa_brt_rangesize == 0) { spa->spa_brt_rangesize = bvphys->bvp_rangesize; } else { ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); } brt_vdev_realloc(spa, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. */ error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); if (error != 0) return (error); ASSERT(bvphys->bvp_mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; dmu_buf_rele(db, FTAG); BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); return (0); } static void brt_vdev_dealloc(brt_vdev_t *brtvd) { ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); ASSERT0(avl_numnodes(&brtvd->bv_tree)); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; brtvd->bv_size = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; ASSERT(brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); ASSERT0(brtvd->bv_totalcount); ASSERT0(brtvd->bv_usedspace); ASSERT0(brtvd->bv_savedspace); uint64_t mos_entries = brtvd->bv_mos_entries; rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = 0; rw_exit(&brtvd->bv_mos_entries_lock); dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); brtvd->bv_mos_entries_dnode = NULL; ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); ASSERT0(count); VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); 
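	/*
	 * The entry ZAP must already be empty here (asserted above). What
	 * remains is to free the per-vdev BRT object, remove its pool
	 * directory entry, and drop the feature references taken in
	 * brt_vdev_create().
	 */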
BRT_DEBUG("MOS entries destroyed, object=%llu", (u_longlong_t)mos_entries); VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; brtvd->bv_entcount_dirty = FALSE; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); brtvd->bv_meta_dirty = FALSE; rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_dealloc(brtvd); rw_exit(&brtvd->bv_lock); spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)) + spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); } static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) { brt_vdev_t **vdevs; ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); if (nvdevs == spa->spa_brt_nvdevs) return; vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); if (spa->spa_brt_nvdevs > 0) { ASSERT(spa->spa_brt_vdevs != NULL); memcpy(vdevs, spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } spa->spa_brt_vdevs = vdevs; for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); avl_create(&brtvd->bv_tree, brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); for (int i = 0; i < TXG_SIZE; i++) { avl_create(&brtvd->bv_pending_tree[i], brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); } mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); spa->spa_brt_vdevs[vdevid] = brtvd; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); spa->spa_brt_nvdevs = nvdevs; } static boolean_t brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) { uint64_t idx = offset / spa->spa_brt_rangesize; if (idx < brtvd->bv_size) { /* VDEV wasn't expanded. */ return (brt_vdev_entcount_get(brtvd, idx) > 0); } return (FALSE); } static void brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize, uint64_t count) { uint64_t idx; ASSERT(brtvd->bv_initiated); brtvd->bv_savedspace += dsize * count; brtvd->bv_meta_dirty = TRUE; if (bre->bre_count > 0) return; brtvd->bv_usedspace += dsize; idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. 
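 * brt_vdev_realloc() below grows the in-memory entcount array and
 * dirty bitmap to cover the new size.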
*/ rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_realloc(spa, brtvd); rw_exit(&brtvd->bv_lock); } ASSERT3U(idx, <, brtvd->bv_size); brtvd->bv_totalcount++; brt_vdev_entcount_inc(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); } static void brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_count > 0) return; brtvd->bv_usedspace -= dsize; idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); brtvd->bv_totalcount--; brt_vdev_entcount_dec(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); } static void brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(brtvd->bv_meta_dirty); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. */ dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } dmu_buf_will_dirty(db, tx); bvphys = db->db_data; bvphys->bvp_mos_entries = brtvd->bv_mos_entries; bvphys->bvp_size = brtvd->bv_size; if (brtvd->bv_need_byteswap) { bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; } else { bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; bvphys->bvp_rangesize = spa->spa_brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); brtvd->bv_meta_dirty = FALSE; } static void brt_vdevs_free(spa_t *spa) { if (spa->spa_brt_vdevs == 0) return; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; rw_enter(&brtvd->bv_lock, RW_WRITER); if (brtvd->bv_initiated) brt_vdev_dealloc(brtvd); rw_exit(&brtvd->bv_lock); rw_destroy(&brtvd->bv_lock); if (brtvd->bv_mos_entries != 0) dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); rw_destroy(&brtvd->bv_mos_entries_lock); avl_destroy(&brtvd->bv_tree); for (int i = 0; i < TXG_SIZE; i++) avl_destroy(&brtvd->bv_pending_tree[i]); mutex_destroy(&brtvd->bv_pending_lock); kmem_free(brtvd, sizeof (*brtvd)); } kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { bre->bre_bp = *bp; bre->bre_count = 0; bre->bre_pcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } +static boolean_t +brt_has_endian_fixed(spa_t *spa) +{ + return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)); +} + static int -brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) +brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t off = BRE_OFFSET(bre); if (brtvd->bv_mos_entries == 0) return (SET_ERROR(ENOENT)); - return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, - &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); + if (brt_has_endian_fixed(spa)) { + return 
(zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count)); + } else { + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count)); + } } /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which * may require reads and thus will be more expensive. */ boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { if (spa->spa_brt_nvdevs == 0) return (B_FALSE); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); if (brtvd == NULL || !brtvd->bv_initiated) return (FALSE); /* * We don't need locks here, since bv_entcount pointer must be * stable at this point, and we don't care about false positive * races here, while false negative should be impossible, since * all brt_vdev_addref() have already completed by this point. */ uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); return (brt_vdev_lookup(spa, brtvd, off)); } uint64_t brt_get_dspace(spa_t *spa) { if (spa->spa_brt_nvdevs == 0) return (0); brt_rlock(spa); uint64_t s = 0; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; brt_unlock(spa); return (s); } uint64_t brt_get_used(spa_t *spa) { if (spa->spa_brt_nvdevs == 0) return (0); brt_rlock(spa); uint64_t s = 0; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; brt_unlock(spa); return (s); } uint64_t brt_get_saved(spa_t *spa) { return (brt_get_dspace(spa)); } uint64_t brt_get_ratio(spa_t *spa) { uint64_t used = brt_get_used(spa); if (used == 0) return (100); return ((used + brt_get_saved(spa)) * 100 / used); } static int brt_kstats_update(kstat_t *ksp, int rw) { brt_stats_t *bs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_still_referenced); bs->brt_decref_free_data_later.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_later); bs->brt_decref_free_data_now.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_now); bs->brt_decref_no_entry.value.ui64 = wmsum_value(&brt_sums.brt_decref_no_entry); return (0); } static void brt_stat_init(void) { wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); wmsum_init(&brt_sums.brt_decref_free_data_now, 0); wmsum_init(&brt_sums.brt_decref_no_entry, 0); brt_ksp = 
kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (brt_ksp != NULL) { brt_ksp->ks_data = &brt_stats; brt_ksp->ks_update = brt_kstats_update; kstat_install(brt_ksp); } } static void brt_stat_fini(void) { if (brt_ksp != NULL) { kstat_delete(brt_ksp); brt_ksp = NULL; } wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); wmsum_fini(&brt_sums.brt_decref_free_data_now); wmsum_fini(&brt_sums.brt_decref_no_entry); } void brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } void brt_fini(void) { brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); } /* Return TRUE if block should be freed immediately. */ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); rw_enter(&brtvd->bv_lock, RW_WRITER); ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); goto out; } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } rw_exit(&brtvd->bv_lock); - error = brt_entry_lookup(brtvd, &bre_search); + error = brt_entry_lookup(spa, brtvd, &bre_search); /* bre_search now contains correct bre_count */ if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_no_entry); return (B_TRUE); } ASSERT0(error); rw_enter(&brtvd->bv_lock, RW_WRITER); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { /* The entry was added when the lock was dropped. 
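 * Use the in-memory entry instead of the count we just loaded
 * from the ZAP; the racing thread may have changed it already.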
*/ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); bre->bre_bp = bre_search.bre_bp; bre->bre_count = bre_search.bre_count; bre->bre_pcount = 0; avl_insert(&brtvd->bv_tree, bre, where); out: if (bre->bre_count == 0) { rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } bre->bre_pcount--; ASSERT(bre->bre_count > 0); bre->bre_count--; if (bre->bre_count == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); rw_exit(&brtvd->bv_lock); return (B_FALSE); } uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); rw_enter(&brtvd->bv_lock, RW_READER); ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { rw_exit(&brtvd->bv_lock); - error = brt_entry_lookup(brtvd, &bre_search); + error = brt_entry_lookup(spa, brtvd, &bre_search); if (error == ENOENT) { refcnt = 0; } else { ASSERT0(error); refcnt = bre_search.bre_count; } } else { refcnt = bre->bre_count; rw_exit(&brtvd->bv_lock); } return (refcnt); } static void brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) { if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) return; uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); if (brtvd->bv_mos_entries != 0) { (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS); } rw_exit(&brtvd->bv_mos_entries_lock); } static int brt_entry_compare(const void *x1, const void *x2) { const brt_entry_t *bre1 = x1, *bre2 = x2; const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), DVA_GET_OFFSET(&bp2->blk_dva[0]))); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_entry_t *bre, *newbre; avl_index_t where; uint64_t txg; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); newbre->bre_bp = *bp; newbre->bre_count = 0; newbre->bre_pcount = 1; mutex_enter(&brtvd->bv_pending_lock); bre = avl_find(pending_tree, newbre, &where); if (bre == NULL) { avl_insert(pending_tree, newbre, where); newbre = NULL; } else { bre->bre_pcount++; } mutex_exit(&brtvd->bv_pending_lock); if (newbre != NULL) { ASSERT(bre != NULL); ASSERT(bre != newbre); kmem_cache_free(brt_entry_cache, newbre); } else { ASSERT0P(bre); /* Prefetch BRT entry for the syncing context. 
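 * Warming the ZAP here makes it less likely that
 * brt_pending_apply() will stall on a synchronous read in
 * spa_sync().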
*/ brt_prefetch(brtvd, bp); } } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_entry_t *bre, bre_search; uint64_t txg; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; bre_search.bre_bp = *bp; mutex_enter(&brtvd->bv_pending_lock); bre = avl_find(pending_tree, &bre_search, NULL); ASSERT(bre != NULL); ASSERT(bre->bre_pcount > 0); bre->bre_pcount--; if (bre->bre_pcount == 0) avl_remove(pending_tree, bre); else bre = NULL; mutex_exit(&brtvd->bv_pending_lock); if (bre) kmem_cache_free(brt_entry_cache, bre); } static void brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) { brt_entry_t *bre, *nbre; /* * We are in syncing context, so no other bv_pending_tree accesses * are possible for the TXG. So we don't need bv_pending_lock. */ ASSERT(avl_is_empty(&brtvd->bv_tree)); avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { nbre = AVL_NEXT(&brtvd->bv_tree, bre); /* * If the block has DEDUP bit set, it means that it * already exists in the DEDUP table, so we can just * use that instead of creating new entry in the BRT. */ if (BP_GET_DEDUP(&bre->bre_bp)) { while (bre->bre_pcount > 0) { if (!ddt_addref(spa, &bre->bre_bp)) break; bre->bre_pcount--; } if (bre->bre_pcount == 0) { avl_remove(&brtvd->bv_tree, bre); kmem_cache_free(brt_entry_cache, bre); continue; } } /* * Unless we know that the block is definitely not in ZAP, * try to get its reference count from there. */ uint64_t off = BRE_OFFSET(bre); if (brtvd->bv_mos_entries != 0 && brt_vdev_lookup(spa, brtvd, off)) { - int error = zap_lookup_uint64_by_dnode( - brtvd->bv_mos_entries_dnode, &off, - BRT_KEY_WORDS, 1, sizeof (bre->bre_count), - &bre->bre_count); + int error; + if (brt_has_endian_fixed(spa)) { + error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count); + } else { + error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count); + } if (error == 0) { BRTSTAT_BUMP(brt_addref_entry_on_disk); } else { ASSERT3U(error, ==, ENOENT); BRTSTAT_BUMP(brt_addref_entry_not_on_disk); } } } /* * If all the cloned blocks we had were handled by DDT, we don't need * to initiate the vdev. */ if (avl_is_empty(&brtvd->bv_tree)) return; if (!brtvd->bv_initiated) { rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_realloc(spa, brtvd); rw_exit(&brtvd->bv_lock); } /* * Convert pending references into proper ones. This has to be a * separate loop, since entcount modifications would cause false * positives for brt_vdev_lookup() on following iterations. 
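 * (brt_vdev_addref() increments exactly the per-region entcount
 * that brt_vdev_lookup() consults.)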
*/ for (bre = avl_first(&brtvd->bv_tree); bre; bre = AVL_NEXT(&brtvd->bv_tree, bre)) { brt_vdev_addref(spa, brtvd, bre, bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); bre->bre_count += bre->bre_pcount; } } void brt_pending_apply(spa_t *spa, uint64_t txg) { brt_rlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; brt_unlock(spa); brt_pending_apply_vdev(spa, brtvd, txg); brt_rlock(spa); } brt_unlock(spa); } static void -brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { uint64_t off = BRE_OFFSET(bre); if (bre->bre_pcount == 0) { /* The net change is zero, nothing to do in ZAP. */ } else if (bre->bre_count == 0) { int error = zap_remove_uint64_by_dnode(dn, &off, BRT_KEY_WORDS, tx); VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(zap_update_uint64_by_dnode(dn, &off, - BRT_KEY_WORDS, 1, sizeof (bre->bre_count), - &bre->bre_count, tx)); + if (brt_has_endian_fixed(spa)) { + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count, tx)); + } else { + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count, tx)); + } } } static void brt_sync_table(spa_t *spa, dmu_tx_t *tx) { brt_entry_t *bre; brt_rlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; brt_unlock(spa); if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); brt_rlock(spa); continue; } ASSERT(!brtvd->bv_entcount_dirty || avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(spa, brtvd, tx); void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); + brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre, + tx); kmem_cache_free(brt_entry_cache, bre); } #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_BRT) brt_vdev_dump(brtvd); #endif if (brtvd->bv_totalcount == 0) brt_vdev_destroy(spa, brtvd, tx); else brt_vdev_sync(spa, brtvd, tx); brt_rlock(spa); } brt_unlock(spa); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; uint64_t vdevid; ASSERT3U(spa_syncing_txg(spa), ==, txg); brt_rlock(spa); for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) break; } if (vdevid >= spa->spa_brt_nvdevs) { brt_unlock(spa); return; } brt_unlock(spa); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); brt_sync_table(spa, tx); dmu_tx_commit(tx); } static void brt_alloc(spa_t *spa) { rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); spa->spa_brt_vdevs = NULL; spa->spa_brt_nvdevs = 0; spa->spa_brt_rangesize = 0; } void brt_create(spa_t *spa) { brt_alloc(spa); spa->spa_brt_rangesize = BRT_RANGESIZE; } int brt_load(spa_t *spa) { int error = 0; brt_alloc(spa); brt_wlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; vdevid++) { char name[64]; uint64_t mos_brtvdev; /* Look if this vdev had active block cloning. */ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)vdevid); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &mos_brtvdev); if (error == ENOENT) { error = 0; continue; } if (error != 0) break; /* If it did, then allocate them all and load this one. 
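 * brt_vdevs_expand() allocates a brt_vdev_t for every top-level
 * vdev; those without an on-disk BRT object stay uninitiated.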
*/ brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; rw_enter(&brtvd->bv_lock, RW_WRITER); brtvd->bv_mos_brtvdev = mos_brtvdev; error = brt_vdev_load(spa, brtvd); rw_exit(&brtvd->bv_lock); if (error != 0) break; } if (spa->spa_brt_rangesize == 0) spa->spa_brt_rangesize = BRT_RANGESIZE; brt_unlock(spa); return (error); } void brt_unload(spa_t *spa) { if (spa->spa_brt_rangesize == 0) return; brt_vdevs_free(spa); rw_destroy(&spa->spa_brt_lock); spa->spa_brt_rangesize = 0; } ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, "Enable prefetching of BRT ZAP entries"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, "BRT ZAP leaf blockshift"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, "BRT ZAP indirect blockshift"); diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 6de0869765ad..3389dcf72f89 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -1,119 +1,120 @@ # SPDX-License-Identifier: CDDL-1.0 # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # # Copyright (c) 2013, 2014 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
# # Set the expected properties of zpool typeset -a properties=( "size" "capacity" "altroot" "health" "guid" "load_guid" "version" "bootfs" "delegation" "autoreplace" "cachefile" "checkpoint" "failmode" "listsnapshots" "autoexpand" "dedupratio" "dedup_table_quota" "dedup_table_size" "free" "allocated" "readonly" "comment" "expandsize" "freeing" "fragmentation" "leaked" "multihost" "autotrim" "compatibility" "bcloneused" "bclonesaved" "bcloneratio" "last_scrubbed_txg" "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress" "feature@multi_vdev_crash_dump" "feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth" "feature@extensible_dataset" "feature@embedded_data" "feature@bookmarks" "feature@filesystem_limits" "feature@large_blocks" "feature@sha512" "feature@skein" "feature@edonr" "feature@device_removal" "feature@obsolete_counts" "feature@zpool_checkpoint" "feature@spacemap_v2" "feature@redaction_bookmarks" "feature@redacted_datasets" "feature@bookmark_written" "feature@log_spacemap" "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" "feature@dynamic_gang_header" ) if is_linux || is_freebsd; then properties+=( "ashift" "feature@large_dnode" "feature@userobj_accounting" "feature@encryption" "feature@project_quota" "feature@allocation_classes" "feature@resilver_defer" "feature@bookmark_v2" "feature@livelist" "feature@zstd_compress" "feature@zilsaxattr" "feature@head_errlog" "feature@blake3" "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" "feature@fast_dedup" "feature@longname" "feature@large_microzap" + "feature@block_cloning_endian" ) fi
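
A note on what the new feature gates, with a minimal sketch. The uint64 ZAP
API takes the integer size and the number of integers as separate arguments.
The pre-existing BRT code passes (1, sizeof (bre->bre_count)), so the 64-bit
reference count is stored as eight 1-byte integers, which the ZAP layer does
not byteswap when the pool is imported on a host of the opposite endianness.
With block_cloning_endian active (new pools only - per the comment in
brt_vdev_create(), the feature only activates when no BRT ZAP exists yet),
the calls pass (sizeof (bre->bre_count), 1) instead, storing one 8-byte
integer that is byteswapped like any other. The helpers below are
hypothetical, for illustration only; in the patch the gating happens inline
in brt_entry_lookup(), brt_pending_apply_vdev() and brt_sync_entry().

/*
 * Hypothetical helpers contrasting the two on-disk encodings of the BRT
 * reference count (illustration only, not part of the patch).
 */
static int
brt_count_lookup_legacy(dnode_t *dn, uint64_t *off, uint64_t *count)
{
	/* Eight 1-byte integers: opaque to ZAP byteswapping. */
	return (zap_lookup_uint64_by_dnode(dn, off, BRT_KEY_WORDS,
	    1, sizeof (*count), count));
}

static int
brt_count_lookup_fixed(dnode_t *dn, uint64_t *off, uint64_t *count)
{
	/* One 8-byte integer: byteswapped on foreign-endian import. */
	return (zap_lookup_uint64_by_dnode(dn, off, BRT_KEY_WORDS,
	    sizeof (*count), 1, count));
}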