diff --git a/sys/contrib/openzfs/config/kernel-inode-times.m4 b/sys/contrib/openzfs/config/kernel-inode-times.m4
index 412e13b47df5..aae95abf1720 100644
--- a/sys/contrib/openzfs/config/kernel-inode-times.m4
+++ b/sys/contrib/openzfs/config/kernel-inode-times.m4
@@ -1,93 +1,93 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
	dnl #
	dnl # 5.6 API change
	dnl # timespec64_trunc() replaced by timestamp_truncate() interface.
	dnl #
	ZFS_LINUX_TEST_SRC([timestamp_truncate], [
		#include <linux/fs.h>
	],[
		struct timespec64 ts;
		struct inode ip;

		memset(&ts, 0, sizeof(ts));
		ts = timestamp_truncate(ts, &ip);
	])

	dnl #
	dnl # 4.18 API change
	dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64.
	dnl #
	ZFS_LINUX_TEST_SRC([inode_times], [
		#include <linux/fs.h>
	],[
		struct inode ip;
		struct timespec ts;

		memset(&ip, 0, sizeof(ip));
		ts = ip.i_mtime;
	])

	dnl #
	dnl # 6.6 API change
	dnl # i_ctime no longer directly accessible, must use
	dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to
	dnl # read/write.
	dnl #
	ZFS_LINUX_TEST_SRC([inode_get_ctime], [
		#include <linux/fs.h>
	],[
		struct inode ip;

		memset(&ip, 0, sizeof(ip));
		inode_get_ctime(&ip);
	])

	ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [
		#include <linux/fs.h>
	],[
		struct inode ip;
-		struct timespec64 ts;
+		struct timespec64 ts = {0};

		memset(&ip, 0, sizeof(ip));
		inode_set_ctime_to_ts(&ip, ts);
	])
])

AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
	AC_MSG_CHECKING([whether timestamp_truncate() exists])
	ZFS_LINUX_TEST_RESULT([timestamp_truncate], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_TIMESTAMP_TRUNCATE, 1,
		    [timestamp_truncate() exists])
	],[
		AC_MSG_RESULT(no)
	])

	AC_MSG_CHECKING([whether inode->i_*time's are timespec64])
	ZFS_LINUX_TEST_RESULT([inode_times], [
		AC_MSG_RESULT(no)
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1,
		    [inode->i_*time's are timespec64])
	])

	AC_MSG_CHECKING([whether inode_get_ctime() exists])
	ZFS_LINUX_TEST_RESULT([inode_get_ctime], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_GET_CTIME, 1,
		    [inode_get_ctime() exists in linux/fs.h])
	],[
		AC_MSG_RESULT(no)
	])

	AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists])
	ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1,
		    [inode_set_ctime_to_ts() exists in linux/fs.h])
	],[
		AC_MSG_RESULT(no)
	])
])

diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h
index bddf395df7ee..921f51f27a20 100644
--- a/sys/contrib/openzfs/include/sys/dmu.h
+++ b/sys/contrib/openzfs/include/sys/dmu.h
@@ -1,1116 +1,1115 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct dsl_crypto_params; struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_ENCRYPTED 0x20 #define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * MDB doesn't have dmu_ot; it defines these macros itself. */ #ifndef ZFS_MDB #define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) #define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) #define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) #define DMU_OT_IS_CRITICAL(ot) \ (DMU_OT_IS_METADATA(ot) && \ (ot) != DMU_OT_DNODE && \ (ot) != DMU_OT_DIRECTORY_CONTENTS && \ (ot) != DMU_OT_SA) /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. 
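As a worked illustration of the encoding above: DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE) is 0x80 | 0x40 | 0x03 = 0xc3, which matches the DMU_OTN_UINT64_METADATA value defined later in this header, and the accessor macros recover each field again. A minimal sketch (the function name is invented; ASSERT/ASSERT3U are the usual ZFS debug macros):

static void
dmu_ot_encoding_example(void)
{
	/* 0x80 (NEWTYPE) | 0x40 (METADATA) | 0x03 (DMU_BSWAP_UINT64) == 0xc3 */
	dmu_object_type_t ot = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE);

	ASSERT(DMU_OT_IS_VALID(ot));		/* NEWTYPE set, byteswap < NUMFUNCS */
	ASSERT(DMU_OT_IS_METADATA(ot));		/* 0x40 bit is set */
	ASSERT(!DMU_OT_IS_ENCRYPTED(ot));	/* 0x20 bit is clear */
	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_UINT64);	/* low 5 bits */
}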
*/ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). 
*/ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_PROJECTUSED_OBJECT (-3ULL) /* * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. */ #define DMU_OBJACCT_PREFIX "obj-" #define DMU_OBJACCT_PREFIX_LEN 4 /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. 
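As a rough usage sketch of these routines (the function name and dataset name are placeholders, and FTAG is the conventional hold tag used throughout ZFS):

static int
inspect_objset_example(const char *name)	/* e.g. "pool/fs" */
{
	objset_t *os;
	int error;

	error = dmu_objset_hold(name, FTAG, &os);
	if (error != 0)
		return (error);

	/* ... inspect the objset, e.g. dmu_objset_id(os) ... */

	dmu_objset_rele(os, FTAG);
	return (0);
}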
*/ typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int dmu_objset_hold(const char *name, const void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t key_required, const void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, const void *tag); void dmu_objset_disown(objset_t *os, boolean_t key_required, const void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. 
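A hedged sketch of that allocation path (names invented; a bonus hold on DMU_NEW_OBJECT is assumed to cover the new dnode, and blocksize 0 is assumed to select the default):

static int
create_object_example(objset_t *os, uint64_t *objectp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);	/* declare intent to create */

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	*objectp = dmu_object_alloc(os, DMU_OTN_UINT64_DATA, 0,
	    DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}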
* * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx); int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the number of levels on a dnode. nlevels must be greater than the * current number of levels or an EINVAL will be returned. */ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly * to accommodate the change. When calling this function, the caller must * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); /* * Set the checksum property on a dnode. 
The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. 
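For example, a read-only peek at one block might look like the sketch below (function name is invented; a flags value of 0 is assumed to request the default read behaviour):

static int
peek_block_example(objset_t *os, uint64_t object, uint64_t offset)
{
	dmu_buf_t *db;
	int error;

	error = dmu_buf_hold(os, object, offset, FTAG, &db, 0);
	if (error != 0)
		return (error);

	/* db->db_data and db->db_size are valid only while the hold is kept */
	/* ... copy out whatever is needed ... */

	dmu_buf_rele(db, FTAG);
	return (0);
}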
*/ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, const void *tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, const void *tag); void dmu_buf_rele(dmu_buf_t *db, const void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, const void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. 
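A minimal sketch of the embedding pattern described above; the names are invented, and kmem_zalloc()/kmem_free()/KM_SLEEP are the usual SPL allocator interfaces, assumed here:

typedef struct my_dbuf_user {
	dmu_buf_user_t	mdu_dbu;	/* first member: allows the simple cast */
	uint64_t	mdu_cookie;
} my_dbuf_user_t;

static void
my_dbuf_user_evict_sync(void *arg)
{
	my_dbuf_user_t *mdu = arg;

	/* The dbuf may already be gone; touch only the private data. */
	kmem_free(mdu, sizeof (*mdu));
}

static void
attach_user_example(dmu_buf_t *db)
{
	my_dbuf_user_t *mdu = kmem_zalloc(sizeof (*mdu), KM_SLEEP);

	dmu_buf_init_user(&mdu->mdu_dbu, my_dbuf_user_evict_sync, NULL, NULL);
	if (dmu_buf_set_user(db, &mdu->mdu_dbu) != NULL)
		kmem_free(mdu, sizeof (*mdu));	/* another user got there first */
}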
* * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* Size of user data, for inclusion in dbuf_cache accounting. */ uint64_t dbu_size; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * User data size accounting. This can be used to artifically inflate the size * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough * to satisfy memory reclaim requests. It's not used for anything else, and * defaults to 0. */ uint64_t dmu_buf_user_size(dmu_buf_t *db); void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. 
*/ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. 
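Putting those rules together, a simple write path could look like this sketch (names are invented; error handling reduced to the essentials):

static int
write_block_example(objset_t *os, uint64_t object, uint64_t offset,
    int length, const void *data)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_write(tx, object, offset, length);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);	/* never committed, so abort it */
		return (error);
	}

	/* Optional: dmu_tx_callback_register(tx, my_done_cb, my_arg); */
	dmu_write(os, object, offset, length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}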
* * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. * * When multiple callbacks are registered to the transaction, the callbacks * will be called in reverse order to let Lustre, the only user of commit * callback currently, take the fast path of its commit callback handling. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf extern uint_t zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. 
*/ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; boolean_t ot_encrypt; const char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; const char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) 
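For instance, a filesystem consumer can gather the statvfs() inputs roughly like this (sketch; the function name is invented and the shift conversion is only indicative):

static void
fill_statvfs_example(objset_t *os)
{
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

	dmu_objset_space(os, &refdbytes, &availbytes, &usedobjs, &availobjs);

	/* e.g. f_blocks ~ (refdbytes + availbytes) >> bshift, f_bfree ~ availbytes >> bshift */
}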
*/ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef struct zfs_file_info { uint64_t zfi_user; uint64_t zfi_group; uint64_t zfi_project; uint64_t zfi_generation; } zfs_file_info_t; typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, - boolean_t replay); + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); /* * Initial setup and final teardown. 
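dmu_read_l0_bps() and dmu_brt_clone() above form the block-cloning path, and this revision drops the trailing replay argument from dmu_brt_clone(). A hedged sketch of a caller (names invented; nbps is assumed to pass the array capacity in and return the populated count; tx is assumed to already hold the destination range, e.g. via dmu_tx_hold_clone_by_dnode()):

static int
clone_range_example(objset_t *srcos, uint64_t srcobj, objset_t *dstos,
    uint64_t dstobj, uint64_t offset, uint64_t length, dmu_tx_t *tx)
{
	struct blkptr bps[16];	/* full blkptr definition comes from elsewhere */
	size_t nbps = 16;
	int error;

	error = dmu_read_l0_bps(srcos, srcobj, offset, length, bps, &nbps);
	if (error != 0)
		return (error);

	return (dmu_brt_clone(dstos, dstobj, offset, length, tx, bps, nbps));
}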
*/ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern uint_t dmu_prefetch_max; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/sys/contrib/openzfs/lib/libspl/os/linux/zone.c b/sys/contrib/openzfs/lib/libspl/os/linux/zone.c index 622d04cbc14a..f8a10bfa167a 100644 --- a/sys/contrib/openzfs/lib/libspl/os/linux/zone.c +++ b/sys/contrib/openzfs/lib/libspl/os/linux/zone.c @@ -1,62 +1,62 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Ricardo Correia. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include zoneid_t getzoneid(void) { char path[PATH_MAX]; char buf[128] = { '\0' }; char *cp; int c = snprintf(path, sizeof (path), "/proc/self/ns/user"); /* This API doesn't have any error checking... */ if (c < 0 || c >= sizeof (path)) - return (0); + return (GLOBAL_ZONEID); ssize_t r = readlink(path, buf, sizeof (buf) - 1); if (r < 0) - return (0); + return (GLOBAL_ZONEID); cp = strchr(buf, '['); if (cp == NULL) - return (0); + return (GLOBAL_ZONEID); cp++; unsigned long n = strtoul(cp, NULL, 10); if (n == ULONG_MAX && errno == ERANGE) - return (0); + return (GLOBAL_ZONEID); zoneid_t z = (zoneid_t)n; return (z); } diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S index 0001e4d69055..190dbabc5ecb 100644 --- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S +++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S @@ -1,2769 +1,2776 @@ /* * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /* * Portions Copyright (c) 2022 Tino Reichardt * - modified assembly to fit into OpenZFS */ #if defined(__arm__) -#define __ARM_ARCH__ 7 -#define __ARM_MAX_ARCH__ 7 +#ifndef __ARM_ARCH +# define __ARM_ARCH__ 7 +#else +# define __ARM_ARCH__ __ARM_ARCH +#endif #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .text .type K256,%object .align 5 K256: .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .align 5 .globl zfs_sha256_block_armv7 .type zfs_sha256_block_armv7,%function zfs_sha256_block_armv7: .Lzfs_sha256_block_armv7: #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ zfs_sha256_block_armv7 #else adr r3,.Lzfs_sha256_block_armv7 #endif add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r14,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH__>=7 ldr r2,[r1],#4 # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ magic eor r12,r12,r12 #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 0 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 0 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 0==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 0<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 1 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 1 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 1==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 1<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 2 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 2 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 2==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 2<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 3 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 3 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 3==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 3<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 4 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 4 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 4==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 4<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 5 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 5==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 5<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 6 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 6 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 6==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 6<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 7 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 7==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 7<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 8 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 8 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 8==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 8<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 9 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 9 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 9==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 9<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 10 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 10 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 10==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 10<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 11 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 11 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 11==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 11<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 12 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 12 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 12==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 12<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 13 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 13 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 13==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 13<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 14 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 14 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 14==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 14<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 15 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 15 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 15==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 15<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#1*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#14*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) .Lrounds_16_xx: @ ldr r2,[sp,#1*4] @ 16 @ ldr r1,[sp,#14*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#0*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#9*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 16==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 16<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#2*4] @ 17 @ ldr r1,[sp,#15*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#1*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#10*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 17==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 17<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#3*4] @ 18 @ ldr r1,[sp,#0*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#2*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#11*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 18==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 18<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#4*4] @ 19 @ ldr r1,[sp,#1*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#3*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#12*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 19==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 19<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#5*4] @ 20 @ ldr r1,[sp,#2*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#4*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#13*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 20==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 20<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#6*4] @ 21 @ ldr r1,[sp,#3*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#5*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#14*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 21==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 21<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#7*4] @ 22 @ ldr r1,[sp,#4*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#6*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#15*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 22==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 22<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#8*4] @ 23 @ ldr r1,[sp,#5*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#7*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#0*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 23==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 23<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#9*4] @ 24 @ ldr r1,[sp,#6*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#8*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#1*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 24==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 24<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#10*4] @ 25 @ ldr r1,[sp,#7*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#9*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#2*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 25==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 25<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#11*4] @ 26 @ ldr r1,[sp,#8*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#10*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#3*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 26==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 26<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#12*4] @ 27 @ ldr r1,[sp,#9*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#11*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#4*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 27==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 27<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#13*4] @ 28 @ ldr r1,[sp,#10*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#12*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#5*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 28==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 28<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#14*4] @ 29 @ ldr r1,[sp,#11*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#13*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#6*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 29==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 29<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#15*4] @ 30 @ ldr r1,[sp,#12*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#14*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#7*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 30==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 30<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#0*4] @ 31 @ ldr r1,[sp,#13*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#15*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#8*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 31==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif
#if 31<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
#ifdef __thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	r3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx
	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
	ldr	r0,[r3,#0]
	ldr	r2,[r3,#4]
	ldr	r12,[r3,#8]
	add	r4,r4,r0
	ldr	r0,[r3,#12]
	add	r5,r5,r2
	ldr	r2,[r3,#16]
	add	r6,r6,r12
	ldr	r12,[r3,#20]
	add	r7,r7,r0
	ldr	r0,[r3,#24]
	add	r8,r8,r2
	ldr	r2,[r3,#28]
	add	r9,r9,r12
	ldr	r1,[sp,#17*4]		@ pull inp
	ldr	r12,[sp,#18*4]		@ pull inp+len
	add	r10,r10,r0
	add	r11,r11,r2
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
	cmp	r1,r12
	sub	r14,r14,#256	@ rewind Ktbl
	bne	.Loop
	add	sp,sp,#19*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
.arch	armv7-a
.fpu	neon
.globl	zfs_sha256_block_neon
.type	zfs_sha256_block_neon,%function
.align	5
.skip	16
zfs_sha256_block_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	sub	r11,sp,#16*4+16
+#if __ARM_ARCH__ >=7
	adr	r14,K256
+#else
+	ldr	r14,=K256
+#endif
	bic	r11,r11,#15		@ align for 128-bit stores
	mov	r12,sp
	mov	sp,r11			@ alloca
	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
	vld1.8	{q0},[r1]!
	vld1.8	{q1},[r1]!
	vld1.8	{q2},[r1]!
	vld1.8	{q3},[r1]!
	vld1.32	{q8},[r14,:128]!
	vld1.32	{q9},[r14,:128]!
	vld1.32	{q10},[r14,:128]!
	vld1.32	{q11},[r14,:128]!
	vrev32.8	q0,q0		@ yes, even on
	str	r0,[sp,#64]
	vrev32.8	q1,q1		@ big-endian
	str	r1,[sp,#68]
	mov	r1,sp
	vrev32.8	q2,q2
	str	r2,[sp,#72]
	vrev32.8	q3,q3
	str	r12,[sp,#76]		@ save original sp
	vadd.i32	q8,q8,q0
	vadd.i32	q9,q9,q1
	vst1.32	{q8},[r1,:128]!
	vadd.i32	q10,q10,q2
	vst1.32	{q9},[r1,:128]!
	vadd.i32	q11,q11,q3
	vst1.32	{q10},[r1,:128]!
	vst1.32	{q11},[r1,:128]!
ldmia r0,{r4-r11} sub r1,r1,#64 ldr r2,[sp,#0] eor r12,r12,r12 eor r3,r5,r6 b .L_00_48 .align 4 .L_00_48: vext.8 q8,q0,q1,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q2,q3,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q0,q0,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#4] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d7,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d7,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d7,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q0,q0,q9 add r10,r10,r2 ldr r2,[sp,#8] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d7,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d7,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d0,d0,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d0,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d0,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d0,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#12] and r3,r3,r12 vshr.u32 d24,d0,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d0,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d1,d1,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q0 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q1,q2,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q3,q0,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q1,q1,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#20] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d1,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d1,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d1,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q1,q1,q9 add r6,r6,r2 ldr r2,[sp,#24] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d1,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d1,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d2,d2,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d2,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d2,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d2,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#28] and r3,r3,r12 vshr.u32 d24,d2,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d2,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d3,d3,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q1 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! 
add r4,r4,r0,ror#2 eor r12,r12,r6 vext.8 q8,q2,q3,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q0,q1,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q2,q2,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#36] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d3,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d3,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d3,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q2,q2,q9 add r10,r10,r2 ldr r2,[sp,#40] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d3,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d3,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d4,d4,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d4,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d4,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d4,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#44] and r3,r3,r12 vshr.u32 d24,d4,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d4,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d5,d5,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q2 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q3,q0,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q1,q2,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q3,q3,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#52] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d5,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d5,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d5,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q3,q3,q9 add r6,r6,r2 ldr r2,[sp,#56] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d5,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d5,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d6,d6,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d6,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d6,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d6,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#60] and r3,r3,r12 vshr.u32 d24,d6,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d6,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d7,d7,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q3 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[r14] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! 
add r4,r4,r0,ror#2 eor r12,r12,r6 teq r2,#0 @ check for K256 terminator ldr r2,[sp,#0] sub r1,r1,#64 bne .L_00_48 ldr r1,[sp,#68] ldr r0,[sp,#72] sub r14,r14,#256 @ rewind r14 teq r1,r0 it eq subeq r1,r1,#64 @ avoid SEGV vld1.8 {q0},[r1]! @ load next input block vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! it ne strne r1,[sp,#68] mov r1,sp add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q0,q0 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q0 ldr r2,[sp,#4] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#8] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#12] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q1,q1 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q1 ldr r2,[sp,#20] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#24] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#28] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! 
and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q2,q2 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q2 ldr r2,[sp,#36] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#40] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#44] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q3,q3 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q3 ldr r2,[sp,#52] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#56] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#60] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#64] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! ldr r0,[r2,#0] add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldr r12,[r2,#4] ldr r3,[r2,#8] ldr r1,[r2,#12] add r4,r4,r0 @ accumulate ldr r0,[r2,#16] add r5,r5,r12 ldr r12,[r2,#20] add r6,r6,r3 ldr r3,[r2,#24] add r7,r7,r1 ldr r1,[r2,#28] add r8,r8,r0 str r4,[r2],#4 add r9,r9,r12 str r5,[r2],#4 add r10,r10,r3 str r6,[r2],#4 add r11,r11,r1 str r7,[r2],#4 stmia r2,{r8-r11} ittte ne movne r1,sp ldrne r2,[sp,#0] eorne r12,r12,r12 ldreq sp,[sp,#76] @ restore original sp itt ne eorne r3,r5,r6 bne .L_00_48 ldmia sp!,{r4-r12,pc} .size zfs_sha256_block_neon,.-zfs_sha256_block_neon # if defined(__thumb2__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d # endif .globl zfs_sha256_block_armv8 .type zfs_sha256_block_armv8,%function .align 5 zfs_sha256_block_armv8: .LARMv8: vld1.32 {q0,q1},[r0] sub r3,r3,#256+32 add r2,r1,r2,lsl#6 @ len to point at the end of inp b .Loop_v8 .align 4 .Loop_v8: vld1.8 {q8-q9},[r1]! vld1.8 {q10-q11},[r1]! vld1.32 {q12},[r3]! vrev32.8 q8,q8 vrev32.8 q9,q9 vrev32.8 q10,q10 vrev32.8 q11,q11 vmov q14,q0 @ offload vmov q15,q1 teq r1,r2 vld1.32 {q13},[r3]! 
vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vld1.32 {q12},[r3]! 
	vadd.i32	q13,q13,q9
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	vld1.32	{q13},[r3]
	vadd.i32	q12,q12,q10
	sub	r3,r3,#256-16	@ rewind
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	vadd.i32	q13,q13,q11
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	vadd.i32	q0,q0,q14
	vadd.i32	q1,q1,q15
	it	ne
	bne	.Loop_v8
	vst1.32	{q0,q1},[r0]
	bx	lr		@ bx lr
.size	zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
index a4c804033b92..499cb6df9567 100644
--- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
+++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
@@ -1,1822 +1,1825 @@
/*
 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Portions Copyright (c) 2022 Tino Reichardt
 * - modified assembly to fit into OpenZFS
 */
#if defined(__arm__)
-#define __ARM_ARCH__ 7
-#define __ARM_MAX_ARCH__ 7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__ 7
+#else
+# define __ARM_ARCH__ __ARM_ARCH
+#endif
#ifndef __KERNEL__
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
#if defined(__thumb2__)
.syntax	unified
.thumb
# define adrl adr
#else
.code	32
#endif
.text
.type	K512,%object
.align	5
K512:
	WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
	WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
	WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
	WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
	WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
	WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
	WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
	WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
	WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
	WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
	WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
	WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
	WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
	WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
	WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
	WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
	WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
	WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
	WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
	WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
	WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
	WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
	WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
	WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 .word 0 @ terminator .align 5 .globl zfs_sha512_block_armv7 .type zfs_sha512_block_armv7,%function zfs_sha512_block_armv7: .Lzfs_sha512_block_armv7: #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ zfs_sha512_block_armv7 #else adr r3,.Lzfs_sha512_block_armv7 #endif add r2,r1,r2,lsl#7 @ len to point at the end of inp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r14,r3,#672 @ K512 sub sp,sp,#9*8 ldr r7,[r0,#32+LO] ldr r8,[r0,#32+HI] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] .Loop: str r9, [sp,#48+0] str r10, [sp,#48+4] str r11, [sp,#56+0] str r12, [sp,#56+4] ldr r5,[r0,#0+LO] ldr r6,[r0,#0+HI] ldr r3,[r0,#8+LO] ldr r4,[r0,#8+HI] ldr r9, [r0,#16+LO] ldr r10, [r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] str r3,[sp,#8+0] str r4,[sp,#8+4] str r9, [sp,#16+0] str r10, [sp,#16+4] str r11, [sp,#24+0] str r12, [sp,#24+4] ldr r3,[r0,#40+LO] ldr r4,[r0,#40+HI] str r3,[sp,#40+0] str r4,[sp,#40+4] .L00_15: #if __ARM_ARCH__<7 ldrb r3,[r1,#7] ldrb r9, [r1,#6] ldrb r10, [r1,#5] ldrb r11, [r1,#4] ldrb r4,[r1,#3] ldrb r12, [r1,#2] orr r3,r3,r9,lsl#8 ldrb r9, [r1,#1] orr r3,r3,r10,lsl#16 ldrb r10, [r1],#8 orr r3,r3,r11,lsl#24 orr r4,r4,r12,lsl#8 orr r4,r4,r9,lsl#16 orr r4,r4,r10,lsl#24 #else ldr r3,[r1,#4] ldr r4,[r1],#8 #ifdef __ARMEL__ rev r3,r3 rev r4,r4 #endif #endif @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#148 ldr r12,[sp,#16+0] @ c.lo #ifdef __thumb2__ it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ 
HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ Maj(a,b,c).hi sub sp,sp,#8 adc r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 tst r14,#1 beq .L00_15 ldr r9,[sp,#184+0] ldr r10,[sp,#184+4] bic r14,r14,#1 .L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov r3,r9,lsr#1 ldr r11,[sp,#80+0] mov r4,r10,lsr#1 ldr r12,[sp,#80+4] eor r3,r3,r10,lsl#31 eor r4,r4,r9,lsl#31 eor r3,r3,r9,lsr#8 eor r4,r4,r10,lsr#8 eor r3,r3,r10,lsl#24 eor r4,r4,r9,lsl#24 eor r3,r3,r9,lsr#7 eor r4,r4,r10,lsr#7 eor r3,r3,r10,lsl#25 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 mov r9,r11,lsr#19 mov r10,r12,lsr#19 eor r9,r9,r12,lsl#13 eor r10,r10,r11,lsl#13 eor r9,r9,r12,lsr#29 eor r10,r10,r11,lsr#29 eor r9,r9,r11,lsl#3 eor r10,r10,r12,lsl#3 eor r9,r9,r11,lsr#6 eor r10,r10,r12,lsr#6 ldr r11,[sp,#120+0] eor r9,r9,r12,lsl#26 ldr r12,[sp,#120+4] adds r3,r3,r9 ldr r9,[sp,#192+0] adc r4,r4,r10 ldr r10,[sp,#192+4] adds r3,r3,r11 adc r4,r4,r12 adds r3,r3,r9 adc r4,r4,r10 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#23 ldr r12,[sp,#16+0] @ c.lo #ifdef __thumb2__ it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ Maj(a,b,c).hi sub sp,sp,#8 adc 
r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 #ifdef __thumb2__ ittt eq @ Thumb2 thing, sanity check in ARM #endif ldreq r9,[sp,#184+0] ldreq r10,[sp,#184+4] beq .L16_79 bic r14,r14,#1 ldr r3,[sp,#8+0] ldr r4,[sp,#8+4] ldr r9, [r0,#0+LO] ldr r10, [r0,#0+HI] ldr r11, [r0,#8+LO] ldr r12, [r0,#8+HI] adds r9,r5,r9 str r9, [r0,#0+LO] adc r10,r6,r10 str r10, [r0,#0+HI] adds r11,r3,r11 str r11, [r0,#8+LO] adc r12,r4,r12 str r12, [r0,#8+HI] ldr r5,[sp,#16+0] ldr r6,[sp,#16+4] ldr r3,[sp,#24+0] ldr r4,[sp,#24+4] ldr r9, [r0,#16+LO] ldr r10, [r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] adds r9,r5,r9 str r9, [r0,#16+LO] adc r10,r6,r10 str r10, [r0,#16+HI] adds r11,r3,r11 str r11, [r0,#24+LO] adc r12,r4,r12 str r12, [r0,#24+HI] ldr r3,[sp,#40+0] ldr r4,[sp,#40+4] ldr r9, [r0,#32+LO] ldr r10, [r0,#32+HI] ldr r11, [r0,#40+LO] ldr r12, [r0,#40+HI] adds r7,r7,r9 str r7,[r0,#32+LO] adc r8,r8,r10 str r8,[r0,#32+HI] adds r11,r3,r11 str r11, [r0,#40+LO] adc r12,r4,r12 str r12, [r0,#40+HI] ldr r5,[sp,#48+0] ldr r6,[sp,#48+4] ldr r3,[sp,#56+0] ldr r4,[sp,#56+4] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] adds r9,r5,r9 str r9, [r0,#48+LO] adc r10,r6,r10 str r10, [r0,#48+HI] adds r11,r3,r11 str r11, [r0,#56+LO] adc r12,r4,r12 str r12, [r0,#56+HI] add sp,sp,#640 sub r14,r14,#640 teq r1,r2 bne .Loop add sp,sp,#8*9 @ destroy frame #if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} #else ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7 .arch armv7-a .fpu neon .globl zfs_sha512_block_neon .type zfs_sha512_block_neon,%function .align 4 zfs_sha512_block_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 add r2,r1,r2,lsl#7 @ len to point at the end of inp adr r3,K512 VFP_ABI_PUSH vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context .Loop_neon: vshr.u64 d24,d20,#14 @ 0 #if 0<16 vld1.64 {d0},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 0>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 0<16 && defined(__ARMEL__) vrev64.8 d0,d0 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 1 #if 1<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 1>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 1<16 && defined(__ARMEL__) vrev64.8 d1,d1 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 2 #if 2<16 vld1.64 {d2},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d18,#18 #if 2>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 2<16 && defined(__ARMEL__) vrev64.8 d2,d2 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 3 #if 3<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 3>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 3<16 && defined(__ARMEL__) vrev64.8 d3,d3 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 4 #if 4<16 vld1.64 {d4},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 4>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 4<16 && defined(__ARMEL__) vrev64.8 d4,d4 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 5 #if 5<16 vld1.64 {d5},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 5>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 5<16 && defined(__ARMEL__) vrev64.8 d5,d5 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 6 #if 6<16 vld1.64 {d6},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 6>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 6<16 && defined(__ARMEL__) vrev64.8 d6,d6 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 7 #if 7<16 vld1.64 {d7},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 7>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 7<16 && defined(__ARMEL__) vrev64.8 d7,d7 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 d24,d20,#14 @ 8 #if 8<16 vld1.64 {d8},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 8>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 8<16 && defined(__ARMEL__) vrev64.8 d8,d8 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 9 #if 9<16 vld1.64 {d9},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 9>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 9<16 && defined(__ARMEL__) vrev64.8 d9,d9 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 10 #if 10<16 vld1.64 {d10},[r1]! @ handles unaligned #endif vshr.u64 d25,d18,#18 #if 10>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 10<16 && defined(__ARMEL__) vrev64.8 d10,d10 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 11 #if 11<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 11>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 11<16 && defined(__ARMEL__) vrev64.8 d11,d11 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 12 #if 12<16 vld1.64 {d12},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 12>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 12<16 && defined(__ARMEL__) vrev64.8 d12,d12 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 13 #if 13<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 13>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 13<16 && defined(__ARMEL__) vrev64.8 d13,d13 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 14 #if 14<16 vld1.64 {d14},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 14>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 14<16 && defined(__ARMEL__) vrev64.8 d14,d14 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 15 #if 15<16 vld1.64 {d15},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 15>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 15<16 && defined(__ARMEL__) vrev64.8 d15,d15 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 mov r12,#4 .L16_79_neon: subs r12,#1 vshr.u64 q12,q7,#19 vshr.u64 q13,q7,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q7,#6 vsli.64 q12,q7,#45 vext.8 q14,q0,q1,#8 @ X[i+1] vsli.64 q13,q7,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q0,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q4,q5,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q0,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q0,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 16<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 17 #if 17<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 17>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 17<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q0,#19 vshr.u64 q13,q0,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q0,#6 vsli.64 q12,q0,#45 vext.8 q14,q1,q2,#8 @ X[i+1] vsli.64 q13,q0,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q1,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q5,q6,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q1,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q1,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 18<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 19 #if 19<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 19>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 19<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q1,#19 vshr.u64 q13,q1,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q1,#6 vsli.64 q12,q1,#45 vext.8 q14,q2,q3,#8 @ X[i+1] vsli.64 q13,q1,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q2,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q6,q7,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q2,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q2,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 20<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 21 #if 21<16 vld1.64 {d5},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d23,#18 #if 21>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 21<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q2,#19 vshr.u64 q13,q2,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q2,#6 vsli.64 q12,q2,#45 vext.8 q14,q3,q4,#8 @ X[i+1] vsli.64 q13,q2,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q3,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q7,q0,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q3,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q3,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 22<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 23 #if 23<16 vld1.64 {d7},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 23>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 23<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 q12,q3,#19 vshr.u64 q13,q3,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q3,#6 vsli.64 q12,q3,#45 vext.8 q14,q4,q5,#8 @ X[i+1] vsli.64 q13,q3,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q4,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q0,q1,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q4,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q4,q15 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 24<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 25 #if 25<16 vld1.64 {d9},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 25>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 25<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q4,#19 vshr.u64 q13,q4,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q4,#6 vsli.64 q12,q4,#45 vext.8 q14,q5,q6,#8 @ X[i+1] vsli.64 q13,q4,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q5,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q1,q2,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q5,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q5,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 26<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 27 #if 27<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 27>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 27<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q5,#19 vshr.u64 q13,q5,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q5,#6 vsli.64 q12,q5,#45 vext.8 q14,q6,q7,#8 @ X[i+1] vsli.64 q13,q5,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q6,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q2,q3,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q6,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q6,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 28<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 29 #if 29<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 29>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 29<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q6,#19 vshr.u64 q13,q6,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q6,#6 vsli.64 q12,q6,#45 vext.8 q14,q7,q0,#8 @ X[i+1] vsli.64 q13,q6,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q7,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q3,q4,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q7,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q7,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 30<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 31 #if 31<16 vld1.64 {d15},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d21,#18 #if 31>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 31<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 bne .L16_79_neon vadd.i64 d16,d30 @ h+=Maj from the past vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp vadd.i64 q8,q12 @ vectorized accumulate vadd.i64 q9,q13 vadd.i64 q10,q14 vadd.i64 q11,q15 vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context teq r1,r2 sub r3,#640 @ rewind K512 bne .Loop_neon VFP_ABI_POP bx lr @ .word 0xe12fff1e .size zfs_sha512_block_neon,.-zfs_sha512_block_neon #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 74755eb6d9ef..2b62abcccb78 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1,1826 +1,1831 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. 
The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. 
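As a quick sanity check of the 2^15 slots-per-block figure above: a 2^24-byte block divided into the customary 2^9-byte dnode slots gives 2^(24-9) = 2^15 slots. The standalone sketch below restates those two shift constants locally (the real DNODE_SHIFT and SPA_MAXBLOCKSHIFT live in the ZFS headers) so it compiles on its own.

	/*
	 * Sketch only: 16MB maximum block / 512-byte dnode slots
	 * = 2^(24 - 9) = 2^15 slots per L0 dnode block.
	 */
	#define	SKETCH_SPA_MAXBLOCKSHIFT	24	/* 16MB (2^24) max block */
	#define	SKETCH_DNODE_SHIFT		9	/* 512-byte (2^9) dnode slot */

	_Static_assert((1 << (SKETCH_SPA_MAXBLOCKSHIFT - SKETCH_DNODE_SHIFT)) ==
	    32768, "at most 2^15 dnode slots per L0 block");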
For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. 
With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; static void zio_crypt_key_destroy_early(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ memset(&key->zk_session, 0, sizeof (key->zk_session)); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } void zio_crypt_key_destroy(zio_crypt_key_t *key) { freebsd_crypt_freesession(&key->zk_session); zio_crypt_key_destroy_early(key); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech __unused; uint_t keydata_len; const zio_crypt_info_t *ci = NULL; ASSERT3P(key, !=, NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); keydata_len = zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); ret = freebsd_crypt_newsession(&key->zk_session, ci, &key->zk_current_key); if (ret) goto error; key->zk_crypt = crypt; key->zk_version = 
ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech __unused; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; freebsd_crypt_freesession(&key->zk_session); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[key->zk_crypt], &key->zk_current_key); if (ret != 0) goto out_unlock; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } void *failed_decrypt_buf; int failed_decrypt_size; /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ /* * The implementation for FreeBSD's OpenCrypto. * * The big difference between ICP and FOC is that FOC uses a single * buffer for input and output. This means that (for AES-GCM, the * only one supported right now) the source must be copied into the * destination, and the destination must have the AAD, and the tag/MAC, * already associated with it. (Both implementations can use a uio.) * * Since the auth data is part of the iovec array, all we need to know * is the length: 0 means there's no AAD. * */ static int zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *uio, uint_t auth_len) { const zio_crypt_info_t *ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); int ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, datalen, auth_len); if (ret != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Returning error %s\n", __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); #endif ret = SET_ERROR(encrypt ? EIO : ECKSUM); } return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. 
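To put a number on the collision bound quoted earlier (reading the approximation as the probability of at least one repeated IV, i.e. 1 - e^(-n*(n-1)/(2*(2^96)))), the following standalone sketch evaluates it at the default rotation limit; the resulting figure, roughly 1.0e-12, is what motivates ZFS_KEY_MAX_SALT_USES_DEFAULT.

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		double n = 400000000.0;			/* default salt-rotation limit */
		double iv_space = ldexp(1.0, 96);	/* 2^96 possible 96-bit IVs */
		/* birthday bound: P(collision) ~= 1 - e^(-n*(n-1)/(2*2^96)) */
		double p = -expm1(-(n * (n - 1.0)) / (2.0 * iv_space));

		/* prints roughly 1.0e-12, the one-in-a-trillion target */
		printf("p(%.0f) ~= %.3e\n", n, p);
		return (0);
	}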
*/ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* * Since we only support one buffer, we need to copy * the plain text (source) to the cipher buffer (dest). * We set iovecs[0] -- the authentication data -- below. */ memcpy(keydata_out, key->zk_master_keydata, keydata_len); memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = keydata_out; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = hmac_keydata_out; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; void *src, *dst; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); zfs_uio_init(&cuio, &cuio_s); /* * Since we only support one buffer, we need to copy * the encrypted buffer (source) to the plain buffer * (dest). We set iovecs[0] -- the authentication data -- * below. 
*/ dst = key->zk_master_keydata; src = keydata; memcpy(dst, src, keydata_len); dst = key->zk_hmac_keydata; src = hmac_keydata; memcpy(dst, src, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = key->zk_master_keydata; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = key->zk_hmac_keydata; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[crypt], &key->zk_current_key); if (ret != 0) goto error; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); crypto_mac(&key->zk_hmac_key, data, datalen, raw_digestbuf, SHA512_DIGEST_LENGTH); memcpy(digestbuf, raw_digestbuf, digestlen); return (0); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. 
*/ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. 
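As a usage sketch for the blkptr codecs just defined, assuming the caller already holds an encrypted (non-objset) blkptr_t pointer named bp: the two decode calls simply recover the parameters that the block comment at the top of this file describes as stashed in DVA[2], the IV2 bits of blk_fill, and blk_cksum.

	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];

	/* salt and the 96-bit IV come back from DVA[2] and the IV2 field */
	zio_crypt_decode_params_bp(bp, salt, iv);
	/* the MAC comes back from the second 128 bits of blk_cksum */
	zio_crypt_decode_mac_bp(bp, mac);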
datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { int avoidlint = SPA_MINBLOCKSIZE; /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, avoidlint); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, avoidlint); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. 
*/ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); crypto_mac_update(ctx, &bab, bab_len); return (0); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; /* authenticate the core dnode (masking out non-portable bits) */ memcpy(tmp_dncore, dnp, sizeof (tmp_dncore)); adnp = (dnode_phys_t *)tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. 
In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; struct hmac_ctx hash_ctx; struct hmac_ctx *ctx = &hash_ctx; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* calculate the portable MAC from the portable fields and metadnode */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* XXX check dnode type ... 
*/ /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (GET_UIO_STRUCT(uio)->uio_iov) kmem_free(GET_UIO_STRUCT(uio)->uio_iov, zfs_uio_iovcnt(uio) * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); #endif return (SET_ERROR(ECKSUM)); } return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. 
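 * As a rough sketch of how a log block is carved up here (illustrative
 * only, the code below is authoritative):
 *
 *   zil_chain_t header       - authenticated, minus the trailing
 *                              zio_eck_t that will hold the MAC
 *   each lr_t record header  - authenticated
 *   lr_write_t lr_blkptr     - left in plaintext and authenticated
 *   lr_clone_range_t bps     - left in plaintext and authenticated
 *   all other record data    - encrypted
 *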
Everything that is not encrypted is authenticated. */ /* * The OpenCrypto used in FreeBSD does not use separate source and * destination buffers; instead, the same buffer is used. Further, to * accommodate some of the drivers, the authbuf needs to be logically before * the data. This means that we need to copy the source to the destination, * and set up an extra iovec_t at the beginning to handle the authbuf. * It also means we'll only return one zfs_uio_t. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { (void) puio; uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); /* Find the start and end record of the log block. */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (byteswap) { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } else { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); /* * Loop over records again, filling in iovecs. */ /* The first iovec will contain the authbuf. */ vec = 1; for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. 
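 *
 * (Sketch: the record is split into [lr_t | fields up to lr_blkptr |
 * blkptr_t | any data that follows the lr_write_t]. The two pieces
 * around the bp get their own encrypted iovecs; the lr_t was already
 * added to the AAD above and the bp is added to it just below.)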
*/ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); vec++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); dst_iovecs[vec].iov_base = (char *) dlrp + sizeof (lr_write_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; vec++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; iovec_t *dst_iovecs; uint_t nr_iovecs, crypt_len, vec; uint_t aad_len = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ /* The first iovec will contain the authbuf. 
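 * (Iovec layout sketch: [0] holds the AAD buffer, [1 .. n-2] hold one
 * entry per encrypted bonus buffer, and the final entry is reserved
 * for the MAC.)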
*/ vec = 1; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } /* The last iovec will contain the MAC. 
*/ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len) { (void) puio; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; void *src, *dst; cipher_iovecs = kmem_zalloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); cipher_iovecs[0].iov_base = dst; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs; zfs_uio_iovcnt(out_uio) = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; GET_UIO_STRUCT(out_uio)->uio_iov = NULL; zfs_uio_iovcnt(out_uio) = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ zfs_uio_segflg(cuio) = UIO_SYSSPACE; mac_iov = ((iovec_t *)&(GET_UIO_STRUCT(cuio)-> uio_iov[zfs_uio_iovcnt(cuio) - 1])); mac_iov->iov_base = (void *)mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } void *failed_decrypt_buf; int faile_decrypt_size; /* * Primary encryption / decryption entrypoint for zio data. 
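 *
 * (Rough call flow, for orientation only: zio_crypt_init_uios() builds
 * the iovecs and AAD for the block type, then either the cached
 * zk_current_key (when the salt matches) or a temporary key derived
 * with hkdf_sha512() is handed to zio_do_crypt_uio_opencrypto() for
 * the actual AEAD operation.)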
*/ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; struct uio puio_s, cuio_s; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio)); memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", __FUNCTION__, encrypt ? "encrypt" : "decrypt", key, salt, ot, iv, mac, datalen, byteswap ? "byteswap" : "native_endian", plainbuf, cipherbuf, no_crypt); printf("\tkey = {"); for (int i = 0; i < key->zk_current_key.ck_length/8; i++) printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); printf("}\n"); #endif /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) return (ret); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = &key->zk_session; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* perform the encryption / decryption */ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, ckey, iv, enc_len, &cuio, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (!encrypt) { if (failed_decrypt_buf != NULL) kmem_free(failed_decrypt_buf, failed_decrypt_size); failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); failed_decrypt_size = datalen; memcpy(failed_decrypt_buf, cipherbuf, datalen); } if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (SET_ERROR(ret)); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. 
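 *
 * (Borrow pattern sketch: the abd acting as the source is borrowed
 * with abd_borrow_buf_copy() so its contents are readable, the
 * destination with abd_borrow_buf(); on the way out the destination
 * is written back with abd_return_buf_copy().)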
*/ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (SET_ERROR(ret)); } #if defined(_KERNEL) && defined(HAVE_SPL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 55f807ccfc13..21f3740f6fe6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -1,2075 +1,2080 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. 
The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. 
The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. 
With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; void zio_crypt_key_destroy(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ crypto_destroy_ctx_template(key->zk_current_tmpl); crypto_destroy_ctx_template(key->zk_hmac_tmpl); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech = {0}; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* * Workaround for GCC 12+ with UBSan enabled deficencies. 
* * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Warray-bounds */ #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif keydata_len = zio_crypt_table[crypt].ci_keylen; #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic pop #endif memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; /* destroy the old context template and create the new one */ crypto_destroy_ctx_template(key->zk_current_tmpl); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, 
RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; CK_AES_CCM_PARAMS ccmp; CK_AES_GCM_PARAMS gcmp; crypto_mechanism_t mech; zio_crypt_info_t crypt_info; uint_t plain_full_len, maclen; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* lookup the encryption info */ crypt_info = zio_crypt_table[crypt]; /* the mac will always be the last iovec_t in the cipher uio */ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; ASSERT(maclen <= ZIO_DATA_MAC_LEN); /* setup encryption mechanism (same as crypt) */ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); /* * Strangely, the ICP requires that plain_full_len must include * the MAC length when decrypting, even though the UIO does not * need to have the extra space allocated. */ if (encrypt) { plain_full_len = datalen; } else { plain_full_len = datalen + maclen; } /* * setup encryption params (currently only AES CCM and AES GCM * are supported) */ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { ccmp.ulNonceSize = ZIO_DATA_IV_LEN; ccmp.ulAuthDataSize = auth_len; ccmp.authData = authbuf; ccmp.ulMACSize = maclen; ccmp.nonce = ivbuf; ccmp.ulDataSize = plain_full_len; mech.cm_param = (char *)(&ccmp); mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); } else { gcmp.ulIvLen = ZIO_DATA_IV_LEN; gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); gcmp.ulAADLen = auth_len; gcmp.pAAD = authbuf; gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); gcmp.pIv = ivbuf; mech.cm_param = (char *)(&gcmp); mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); } /* populate the cipher and plain data structs. 
*/ plaindata.cd_format = CRYPTO_DATA_UIO; plaindata.cd_offset = 0; plaindata.cd_uio = puio; plaindata.cd_length = plain_full_len; cipherdata.cd_format = CRYPTO_DATA_UIO; cipherdata.cd_offset = 0; cipherdata.cd_uio = cuio; cipherdata.cd_length = datalen + maclen; /* perform the actual encryption */ if (encrypt) { ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } } else { ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata); if (ret != CRYPTO_SUCCESS) { ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); ret = SET_ERROR(ECKSUM); goto error; } } return (0); error: return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata_out; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata_out; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. 
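 *
 * (AAD sketch: a version 0 key authenticates only LE_64(zk_guid),
 * while current-version keys authenticate LE_64(guid), LE_64(crypt)
 * and LE_64(version), i.e. three 64-bit words.)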
*/ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_iovcnt = 2; puio.uio_segflg = UIO_SYSSPACE; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); keydata_len = zio_crypt_table[crypt].ci_keylen; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_segflg = UIO_SYSSPACE; puio.uio_iovcnt = 2; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. 
*/ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { int ret; crypto_mechanism_t mech; crypto_data_t in_data, digest_data; uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); /* initialize sha512-hmac mechanism and crypto data */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; /* initialize the crypto data */ in_data.cd_format = CRYPTO_DATA_RAW; in_data.cd_offset = 0; in_data.cd_length = datalen; in_data.cd_raw.iov_base = (char *)data; in_data.cd_raw.iov_len = in_data.cd_length; digest_data.cd_format = CRYPTO_DATA_RAW; digest_data.cd_offset = 0; digest_data.cd_length = SHA512_DIGEST_LENGTH; digest_data.cd_raw.iov_base = (char *)raw_digestbuf; digest_data.cd_raw.iov_len = digest_data.cd_length; /* generate the hmac */ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, &digest_data); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(digestbuf, raw_digestbuf, digestlen); return (0); error: memset(digestbuf, 0, digestlen); return (ret); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. 
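 *
 * (Placement sketch, for orientation only: the 64-bit salt is kept in
 * DVA[2] word 0, the first 64 bits of the IV in DVA[2] word 1, the
 * remaining 32 IV bits behind BP_GET_IV2()/BP_SET_IV2(), and the MAC
 * in blk_cksum words 2 and 3; ZIL blocks instead keep their MAC in
 * the zil_chain_t embedded checksum.)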
*/ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. 
datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. 
*/ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { int ret; uint_t bab_len; blkptr_auth_buf_t bab; crypto_data_t cd; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; cd.cd_length = bab_len; cd.cd_raw.iov_base = (char *)&bab; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } return (0); error: return (ret); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp, tmp_dncore; size_t dn_core_size = offsetof(dnode_phys_t, dn_blkptr); boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* * Authenticate the core dnode (masking out non-portable bits). * We only copy the first 64 bytes we operate on to avoid the overhead * of copying 512-64 unneeded bytes. The compiler seems to be fine * with that. */ memcpy(&tmp_dncore, dnp, dn_core_size); adnp = &tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; cd.cd_length = dn_core_size; cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. 
During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; crypto_mechanism_t mech; crypto_context_t ctx; crypto_data_t cd; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* initialize HMAC mechanism */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* calculate the portable MAC from the portable fields and metadnode */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_portable_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. 
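 *
 * Summarizing the cases handled just below (this is a restatement of
 * the condition that follows, for readability):
 *
 *	local_mac is zeroed out when:
 *	    - user accounting has not been completed yet
 *	      (OBJSET_FLAG_USERACCOUNTING_COMPLETE is clear), or
 *	    - the objset is V3-sized and the userused, groupused and
 *	      projectused dnodes are all DMU_OT_NONE, or
 *	    - the objset is V2-sized and the userused and groupused
 *	      dnodes are both DMU_OT_NONE, or
 *	    - the objset is V1-sized, which has no accounting dnodes.
 *
 *	Otherwise the local MAC is computed over the non-portable
 *	os_flags and whichever accounting dnodes are present.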
*/ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_local_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. 
This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; zil_chain_t *zilc; lr_t *lr; uint8_t *aadbuf = zio_buf_alloc(datalen); /* cipherbuf always needs an extra iovec for the MAC */ if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } memset(dst, 0, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } nr_src += nr_iovecs; nr_dst += nr_iovecs; /* allocate the iovec arrays */ if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. 
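 *
 * To illustrate what is and is not covered (field names are taken from
 * the zil_chain_t definition in the ZIL headers; this sketch is
 * illustrative only):
 *
 *	zil_chain_t {
 *		uint64_t  zc_pad;	authenticated
 *		blkptr_t  zc_next_blk;	authenticated
 *		uint64_t  zc_nused;	authenticated
 *		zio_eck_t zc_eck;	not authenticated, holds our MAC
 *	}
 *
 * which is why the AAD copy below stops at
 * sizeof (zil_chain_t) - sizeof (zio_eck_t).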
*/ memcpy(dst, src, sizeof (zil_chain_t)); memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); /* loop over records again, filling in iovecs */ nr_iovecs = 0; slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_write_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; nr_iovecs++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * Special case handling routine for encrypting / decrypting dnode blocks. 
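 * Roughly, each dnode slot is split as follows (this is a summary of
 * the loop below, for illustration only):
 *
 *	core fields (first 64 bytes)	copied in the clear, authenticated
 *					with non-portable dn_flags bits and
 *					dn_used masked out
 *	dn_blkptr[] and spill bp	copied in the clear, with their MACs
 *					and portable blk_prop bits added to
 *					the AAD
 *	bonus buffer			becomes a plaintext/ciphertext iovec
 *					pair if dn_bonustype is an encrypted
 *					type, otherwise it is copied and
 *					authenticated like the rest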
*/ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; uint8_t *aadbuf = zio_buf_alloc(datalen); if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } nr_src += nr_iovecs; nr_dst += nr_iovecs; if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } nr_iovecs = 0; /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). 
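 * As a worked example (numbers for illustration only): a single-slot
 * dnode with one block pointer and no spill pointer has
 * 512 - 64 - 128 = 320 bytes of bonus space here, and 320 is a
 * multiple of the 16-byte AES block, so no partial block is produced
 * regardless of the actual dn_bonuslen.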
*/ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { ASSERT3U(nr_iovecs, <, nr_src); ASSERT3U(nr_iovecs, <, nr_dst); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { (void) encrypt; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; /* allocate the iovecs for the plain and cipher data */ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), KM_SLEEP); if (!plain_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } plain_iovecs[0].iov_base = plainbuf; plain_iovecs[0].iov_len = datalen; cipher_iovecs[0].iov_base = cipherbuf; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; puio->uio_iov = plain_iovecs; puio->uio_iovcnt = nr_plain; cuio->uio_iov = cipher_iovecs; cuio->uio_iovcnt = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. 
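 *
 * The resulting uio layout is, roughly:
 *
 *	puio: [ plaintext iovec 0 ] ... [ plaintext iovec N-1 ]
 *	cuio: [ ciphertext iovec 0 ] ... [ ciphertext iovec N-1 ] [ MAC ]
 *
 * where the trailing cuio iovec is pointed at the caller's mac buffer
 * (ZIO_DATA_MAC_LEN bytes) by zio_crypt_init_uios() below.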
*/ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ puio->uio_segflg = UIO_SYSSPACE; cuio->uio_segflg = UIO_SYSSPACE; mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); mac_iov->iov_base = mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; memset(&puio, 0, sizeof (puio)); memset(&cuio, 0, sizeof (cuio)); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = key->zk_current_tmpl; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* * Attempt to use QAT acceleration if we can. We currently don't * do this for metadnode and ZIL blocks, since they have a much * more involved buffer layout and the qat_crypt() function only * works in-place. */ if (qat_crypt_use_accel(datalen) && ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { uint8_t *srcbuf, *dstbuf; if (encrypt) { srcbuf = plainbuf; dstbuf = cipherbuf; } else { srcbuf = cipherbuf; dstbuf = plainbuf; } ret = qat_crypt((encrypt) ? 
QAT_ENCRYPT : QAT_DECRYPT, srcbuf, dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); if (ret == CPA_STATUS_SUCCESS) { if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } return (0); } /* If the hardware implementation fails fall back to software */ } /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) goto error; /* perform the encryption / decryption in software */ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, &puio, &cuio, authbuf, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (ret); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (ret); } #if defined(_KERNEL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index b0529521ec76..759bc8d2e2b8 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -1,1753 +1,1753 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Block Cloning design. * * Block Cloning allows to manually clone a file (or a subset of its blocks) * into another (or the same) file by just creating additional references to * the data blocks without copying the data itself. Those references are kept * in the Block Reference Tables (BRTs). * * In many ways this is similar to the existing deduplication, but there are * some important differences: * * - Deduplication is automatic and Block Cloning is not - one has to use a * dedicated system call(s) to clone the given file/blocks. * - Deduplication keeps all data blocks in its table, even those referenced * just once. Block Cloning creates an entry in its tables only when there * are at least two references to the given data block. If the block was * never explicitly cloned or the second to last reference was dropped, * there will be neither space nor performance overhead. * - Deduplication needs data to work - one needs to pass real data to the * write(2) syscall, so hash can be calculated. Block Cloning doesn't require * data, just block pointers to the data, so it is extremely fast, as we pay * neither the cost of reading the data, nor the cost of writing the data - * we operate exclusively on metadata. * - If the D (dedup) bit is not set in the block pointer, it means that * the block is not in the dedup table (DDT) and we won't consult the DDT * when we need to free the block. Block Cloning must be consulted on every * free, because we cannot modify the source BP (eg. by setting something * similar to the D bit), thus we have no hint if the block is in the * Block Reference Table (BRT), so we need to look into the BRT. There is * an optimization in place that allows us to eliminate the majority of BRT * lookups which is described below in the "Minimizing free penalty" section. * - The BRT entry is much smaller than the DDT entry - for BRT we only store * 64bit offset and 64bit reference counter. * - Dedup keys are cryptographic hashes, so two blocks that are close to each * other on disk are most likely in totally different parts of the DDT. * The BRT entry keys are offsets into a single top-level VDEV, so data blocks * from one file should have BRT entries close to each other. * - Scrub will only do a single pass over a block that is referenced multiple * times in the DDT. Unfortunately it is not currently (if at all) possible * with Block Cloning and block referenced multiple times will be scrubbed * multiple times. The new, sorted scrub should be able to eliminate * duplicated reads given enough memory. * - Deduplication requires cryptographically strong hash as a checksum or * additional data verification. Block Cloning works with any checksum * algorithm or even with checksumming disabled. * * As mentioned above, the BRT entries are much smaller than the DDT entries. * To uniquely identify a block we just need its vdev id and offset. We also * need to maintain a reference counter. The vdev id will often repeat, as there * is a small number of top-level VDEVs and a large number of blocks stored in * each VDEV. 
We take advantage of that to reduce the BRT entry size further by * maintaining one BRT for each top-level VDEV, so we can then have only offset * and counter as the BRT entry. * * Minimizing free penalty. * * Block Cloning allows creating additional references to any existing block. * When we free a block there is no hint in the block pointer whether the block * was cloned or not, so on each free we have to check if there is a * corresponding entry in the BRT or not. If there is, we need to decrease * the reference counter. Doing BRT lookup on every free can potentially be * expensive by requiring additional I/Os if the BRT doesn't fit into memory. * This is the main problem with deduplication, so we've learned our lesson and * try not to repeat the same mistake here. How do we do that? We divide each * top-level VDEV into 16MB regions. For each region we maintain a counter that * is a sum of all the BRT entries that have offsets within the region. This * creates the entries count array of 16bit numbers for each top-level VDEV. * The entries count array is always kept in memory and updated on disk in the * same transaction group as the BRT updates to keep everything in-sync. We can * keep the array in memory, because it is very small. With 16MB regions and * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease * the region size even further in the future). Now, when we want to free * a block, we first consult the array. If the counter for the whole region is * zero, there is no need to look for the BRT entry, as there isn't one for * sure. If the counter for the region is greater than zero, only then we will * do a BRT lookup and if an entry is found we will decrease the reference * counter in the BRT entry and in the entry counters array. * * The entry counters array is small, but can potentially be larger for very * large VDEVs or smaller regions. In this case we don't want to rewrite entire * array on every change. We then divide the array into 32kB block and keep * a bitmap of dirty blocks within a transaction group. When we sync the * transaction group we can only update the parts of the entry counters array * that were modified. Note: Keeping track of the dirty parts of the entry * counters array is implemented, but updating only parts of the array on disk * is not yet implemented - for now we will update entire array if there was * any change. * * The implementation tries to be economic: if BRT is not used, or no longer * used, there will be no entries in the MOS and no additional memory used (eg. * the entry counters array is only allocated if needed). * * Interaction between Deduplication and Block Cloning. * * If both functionalities are in use, we could end up with a block that is * referenced multiple times in both DDT and BRT. When we free one of the * references we couldn't tell where it belongs, so we would have to decide * what table takes the precedence: do we first clear DDT references or BRT * references? To avoid this dilemma BRT cooperates with DDT - if a given block * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will * lookup DDT entry instead and increase the counter there. No BRT entry * will be created for a block which has the D (dedup) bit set. * BRT may be more efficient for manual deduplication, but if the block is * already in the DDT, then creating additional BRT entry would be less * efficient. This clever idea was proposed by Allan Jude. * * Block Cloning across datasets. 
* * Block Cloning is not limited to cloning blocks within the same dataset. * It is possible (and very useful) to clone blocks between different datasets. * One use case is recovering files from snapshots. By cloning the files into * dataset we need no additional storage. Without Block Cloning we would need * additional space for those files. * Another interesting use case is moving the files between datasets * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. * Note: currently it is not possible to clone blocks between encrypted * datasets, even if those datasets use the same encryption key (this includes * snapshots of encrypted datasets). Cloning blocks between datasets that use * the same keys should be possible and should be implemented in the future. * * Block Cloning flow through ZFS layers. * * Note: Block Cloning can be used both for cloning file system blocks and ZVOL * blocks. As of this writing no interface is implemented that allows for block * cloning within a ZVOL. * FreeBSD and Linux provides copy_file_range(2) system call and we will use it * for blocking cloning. * * ssize_t * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing * the given data we first read block pointers using the new dmu_read_l0_bps() * function from the source file. Once we have BPs from the source file we call * the dmu_brt_clone() function on the destination file. This function * allocates BPs for us. We iterate over all source BPs. If the given BP is * a hole or an embedded block, we just copy BP as-is. If it points to a real * data we place this BP on a BRT pending list using the brt_pending_add() * function. * * We use this pending list to keep track of all BPs that got new references * within this transaction group. * * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time * in this txg or consecutive time). * - A block may be cloned and freed within the same transaction group * (see dbuf_undirty()). * - A block may be cloned and within the same transaction group the clone * can be cloned again (see dmu_read_l0_bps()). * - A file might have been deleted, but the caller still has a file descriptor * open to this file and clones it. * * When we free a block we have an additional step in the ZIO pipeline where we * call the zio_brt_free() function. We then call the brt_entry_decref() * that loads the corresponding BRT entry (if one exists) and decreases * reference counter. 
If this is not the last reference we will stop ZIO * pipeline here. If this is the last reference or the block is not in the * BRT, we continue the pipeline and free the block as usual. * * At the beginning of spa_sync() where there can be no more block cloning, * but before issuing frees we call brt_pending_apply(). This function applies * all the new clones to the BRT table - we load BRT entries and update * reference counters. To sync new BRT entries to disk, we use brt_sync() * function. This function will sync all dirty per-top-level-vdev BRTs, * the entry counters arrays, etc. * * Block Cloning and ZIL. * * Every clone operation is divided into chunks (similar to write) and each * chunk is cloned in a separate transaction. The chunk size is determined by * how many BPs we can fit into a single ZIL entry. * Replaying clone operation is different from the regular clone operation, * as when we log clone operations we cannot use the source object - it may * reside on a different dataset, so we log BPs we want to clone. * The ZIL is replayed when we mount the given dataset, not when the pool is * imported. Taking this into account it is possible that the pool is imported * without mounting datasets and the source dataset is destroyed before the * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE - * entries, we will bump reference counters for their BPs in the BRT and then - * on mount and ZIL replay we will just attach BPs to the file without - * bumping reference counters. - * Note it is still possible that after zil_claim() we never mount the - * destination, so we never replay its ZIL and we destroy it. This way we would - * end up with leaked references in BRT. We address that too as ZFS gives us - * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. */ static kmem_cache_t *brt_entry_cache; static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ int zfs_brt_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ } \ } while (0) #else #define BRT_DEBUG(...) 
do { } while (0) #endif int brt_zap_leaf_blockshift = 12; int brt_zap_indirect_blockshift = 12; static kstat_t *brt_ksp; typedef struct brt_stats { kstat_named_t brt_addref_entry_in_memory; kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; kstat_named_t brt_addref_entry_read_lost_race; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; kstat_named_t brt_decref_entry_not_on_disk; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; kstat_named_t brt_decref_free_data_now; kstat_named_t brt_decref_no_entry; } brt_stats_t; static brt_stats_t brt_stats = { { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, { "decref_free_data_now", KSTAT_DATA_UINT64 }, { "decref_no_entry", KSTAT_DATA_UINT64 } }; struct { wmsum_t brt_addref_entry_in_memory; wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; wmsum_t brt_addref_entry_read_lost_race; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; wmsum_t brt_decref_entry_not_on_disk; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; wmsum_t brt_decref_free_data_now; wmsum_t brt_decref_no_entry; } brt_sums; #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const void *x1, const void *x2); static int brt_pending_entry_compare(const void *x1, const void *x2); static void brt_rlock(brt_t *brt) { rw_enter(&brt->brt_lock, RW_READER); } static void brt_wlock(brt_t *brt) { rw_enter(&brt->brt_lock, RW_WRITER); } static void brt_unlock(brt_t *brt) { rw_exit(&brt->brt_lock); } static uint16_t brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) { ASSERT3U(idx, <, brtvd->bv_size); if (brtvd->bv_need_byteswap) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); } } static void brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) { ASSERT3U(idx, <, brtvd->bv_size); if (brtvd->bv_need_byteswap) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; } } static void brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt < UINT16_MAX); brt_vdev_entcount_set(brtvd, idx, entcnt + 1); } static void brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt > 0); brt_vdev_entcount_set(brtvd, idx, entcnt - 1); } #ifdef ZFS_DEBUG static void brt_vdev_dump(brt_t *brt) { brt_vdev_t *brtvd; uint64_t vdevid; if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { return; } if (brt->brt_nvdevs == 0) { zfs_dbgmsg("BRT empty"); return; } zfs_dbgmsg("BRT vdev dump:"); for (vdevid = 0; vdevid < 
brt->brt_nvdevs; vdevid++) { uint64_t idx; brtvd = &brt->brt_vdevs[vdevid]; zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, (u_longlong_t)brtvd->bv_nblocks, (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { if (brt_vdev_entcount_get(brtvd, idx) > 0) { zfs_dbgmsg(" [%04llu] %hu", (u_longlong_t)idx, brt_vdev_entcount_get(brtvd, idx)); } } } if (brtvd->bv_entcount_dirty) { char *bitmap; bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); for (idx = 0; idx < brtvd->bv_nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" bitmap: %s", bitmap); kmem_free(bitmap, brtvd->bv_nblocks + 1); } } } #endif static brt_vdev_t * brt_vdev(brt_t *brt, uint64_t vdevid) { brt_vdev_t *brtvd; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); if (vdevid < brt->brt_nvdevs) { brtvd = &brt->brt_vdevs[vdevid]; } else { brtvd = NULL; } return (brtvd); } static void brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_bitmap != NULL); ASSERT(brtvd->bv_nblocks > 0); brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); /* * We allocate DMU buffer to store the bv_entcount[] array. * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
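 * For reference, the bonus buffer (brt_vdev_phys_t) carries roughly the
 * following fields; see brt_vdev_sync() and brt_vdev_load() for the
 * authoritative set:
 *
 *	bvp_mos_entries		object number of the per-vdev entries ZAP
 *	bvp_size		number of bv_entcount[] entries written
 *	bvp_byteorder		byte order the array was written in
 *	bvp_totalcount		sum of all bv_entcount[] entries
 *	bvp_rangesize		region size the counters are keyed on
 *	bvp_usedspace		space referenced by cloned blocks
 *	bvp_savedspace		space saved thanks to additional references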
*/ brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); BRT_DEBUG("MOS BRT VDEV created, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; uint64_t nblocks, size; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); ASSERT(brtvd->bv_entcount == NULL); ASSERT(brtvd->bv_bitmap == NULL); ASSERT0(brtvd->bv_nblocks); avl_create(&brtvd->bv_tree, brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); ASSERT(brtvd->bv_nblocks > 0); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, * size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(brtvd->bv_nblocks))); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; brtvd->bv_nblocks = nblocks; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; BRT_DEBUG("BRT VDEV %llu initiated.", (u_longlong_t)brtvd->bv_vdevid); } } static void brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) { char name[64]; dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); if (error != 0) return; ASSERT(brtvd->bv_mos_brtvdev != 0); error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); ASSERT0(error); if (error != 0) return; bvphys = db->db_data; if (brt->brt_rangesize == 0) { brt->brt_rangesize = bvphys->bvp_rangesize; } else { ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); } ASSERT(!brtvd->bv_initiated); brt_vdev_realloc(brt, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
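 * For example (sizes are hypothetical): if the on-disk array held
 * bvp_size == 100 entries but the grown vdev now needs bv_size == 120,
 * the dmu_read() below pulls in only the first 100 entries and entries
 * 100..119 stay zero, which is correct since no BRT entries can exist
 * yet for the newly added range.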
*/ error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); ASSERT0(error); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; ASSERT(brtvd->bv_mos_entries != 0); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; brt->brt_usedspace += brtvd->bv_usedspace; brt->brt_savedspace += brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", name, (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); } static void brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) { ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_initiated); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_bitmap = NULL; ASSERT0(avl_numnodes(&brtvd->bv_tree)); avl_destroy(&brtvd->bv_tree); brtvd->bv_size = 0; brtvd->bv_nblocks = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); VERIFY0(count); VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); BRT_DEBUG("MOS entries destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); brtvd->bv_mos_entries = 0; VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); bvphys = db->db_data; ASSERT0(bvphys->bvp_totalcount); ASSERT0(bvphys->bvp_usedspace); ASSERT0(bvphys->bvp_savedspace); dmu_buf_rele(db, FTAG); VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); brt_vdev_dealloc(brt, brtvd); spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) { brt_vdev_t *brtvd, *vdevs; uint64_t vdevid; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT3U(nvdevs, >, brt->brt_nvdevs); vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); if (brt->brt_nvdevs > 0) { ASSERT(brt->brt_vdevs != NULL); memcpy(vdevs, brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); } for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { brtvd = &vdevs[vdevid]; brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); brt->brt_vdevs = vdevs; brt->brt_nvdevs = nvdevs; } static boolean_t brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) { uint64_t idx; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); idx = bre->bre_offset / brt->brt_rangesize; if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { /* VDEV wasn't expanded. 
*/ return (brt_vdev_entcount_get(brtvd, idx) > 0); } return (FALSE); } static void brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); brt->brt_savedspace += dsize; brtvd->bv_savedspace += dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_refcount > 1) { return; } brt->brt_usedspace += dsize; brtvd->bv_usedspace += dsize; idx = bre->bre_offset / brt->brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ brt_vdev_realloc(brt, brtvd); } ASSERT3U(idx, <, brtvd->bv_size); brtvd->bv_totalcount++; brt_vdev_entcount_inc(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG brt_vdev_dump(brt); #endif } static void brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); brt->brt_savedspace -= dsize; brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_refcount > 0) { return; } brt->brt_usedspace -= dsize; brtvd->bv_usedspace -= dsize; idx = bre->bre_offset / brt->brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); brtvd->bv_totalcount--; brt_vdev_entcount_dec(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG brt_vdev_dump(brt); #endif } static void brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(brtvd->bv_meta_dirty); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
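 * One possible shape for that, assuming each bv_bitmap bit covers one
 * BRT_BLOCKSIZE-sized chunk of the array (sketch only, not what is
 * implemented below):
 *
 *	for (uint64_t b = 0; b < brtvd->bv_nblocks; b++) {
 *		if (!BT_TEST(brtvd->bv_bitmap, b))
 *			continue;
 *		uint64_t off = b * BRT_BLOCKSIZE;
 *		uint64_t len = MIN(BRT_BLOCKSIZE,
 *		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]) - off);
 *		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, off, len,
 *		    (char *)brtvd->bv_entcount + off, tx);
 *	}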
*/ dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_entcount_dirty = FALSE; } dmu_buf_will_dirty(db, tx); bvphys = db->db_data; bvphys->bvp_mos_entries = brtvd->bv_mos_entries; bvphys->bvp_size = brtvd->bv_size; if (brtvd->bv_need_byteswap) { bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; } else { bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; bvphys->bvp_rangesize = brt->brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); brtvd->bv_meta_dirty = FALSE; } static void brt_vdevs_alloc(brt_t *brt, boolean_t load) { brt_vdev_t *brtvd; uint64_t vdevid; brt_wlock(brt); brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); if (load) { for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; ASSERT(brtvd->bv_entcount == NULL); brt_vdev_load(brt, brtvd); } } if (brt->brt_rangesize == 0) { brt->brt_rangesize = BRT_RANGESIZE; } brt_unlock(brt); } static void brt_vdevs_free(brt_t *brt) { brt_vdev_t *brtvd; uint64_t vdevid; brt_wlock(brt); for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; if (brtvd->bv_initiated) brt_vdev_dealloc(brt, brtvd); } kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); brt_unlock(brt); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); bre->bre_refcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } static int brt_entry_compare(const void *x1, const void *x2) { const brt_entry_t *bre1 = x1; const brt_entry_t *bre2 = x2; return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); } static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); if (!brt_vdev_lookup(brt, brtvd, bre)) return (SET_ERROR(ENOENT)); /* * Remember mos_entries object number. After we reacquire the BRT lock, * the brtvd pointer may be invalid. */ mos_entries = brtvd->bv_mos_entries; if (mos_entries == 0) return (SET_ERROR(ENOENT)); brt_unlock(brt); error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, BRT_KEY_WORDS, &one, &physsize); if (error == 0) { ASSERT3U(one, ==, 1); ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " "count=%llu error=%d", (u_longlong_t)mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, error == 0 ? 
(u_longlong_t)bre->bre_refcount : 0, error); } brt_wlock(brt); return (error); } static void brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) { brt_vdev_t *brtvd; uint64_t mos_entries = 0; brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd != NULL) mos_entries = brtvd->bv_mos_entries; brt_unlock(brt); if (mos_entries == 0) return; BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", (u_longlong_t)mos_entries, (u_longlong_t)vdevid, (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } static int brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); ASSERT(bre->bre_refcount > 0); error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount, tx); BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " "error=%d", (u_longlong_t)brtvd->bv_mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, (u_longlong_t)bre->bre_refcount, error); return (error); } static int brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); ASSERT0(bre->bre_refcount); error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " "error=%d", (u_longlong_t)brtvd->bv_mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, (u_longlong_t)bre->bre_refcount, error); return (error); } /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which * may require reads and thus will be more expensive. 
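 *
 * A hypothetical caller on the free path can use it as a cheap filter
 * (illustrative sketch only):
 *
 *	if (brt_maybe_exists(spa, bp)) {
 *		(slow path: bp may have been cloned, so consult the BRT
 *		 before freeing its data)
 *	} else {
 *		(fast path: bp cannot have a BRT entry, free it directly)
 *	}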
*/ boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t bre_search; boolean_t mayexists = FALSE; uint64_t vdevid; brt_entry_fill(bp, &bre_search, &vdevid); brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd != NULL && brtvd->bv_initiated) { if (!avl_is_empty(&brtvd->bv_tree) || brt_vdev_lookup(brt, brtvd, &bre_search)) { mayexists = TRUE; } } brt_unlock(brt); return (mayexists); } uint64_t brt_get_dspace(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_savedspace); } uint64_t brt_get_used(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_usedspace); } uint64_t brt_get_saved(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_savedspace); } uint64_t brt_get_ratio(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt->brt_usedspace == 0) return (100); return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / brt->brt_usedspace); } static int brt_kstats_update(kstat_t *ksp, int rw) { brt_stats_t *bs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); bs->brt_addref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_in_memory); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); bs->brt_addref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); bs->brt_decref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_still_referenced); bs->brt_decref_free_data_later.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_later); bs->brt_decref_free_data_now.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_now); bs->brt_decref_no_entry.value.ui64 = wmsum_value(&brt_sums.brt_decref_no_entry); return (0); } static void brt_stat_init(void) { wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); wmsum_init(&brt_sums.brt_decref_free_data_now, 0); wmsum_init(&brt_sums.brt_decref_no_entry, 0); brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (brt_ksp != NULL) { brt_ksp->ks_data = &brt_stats; brt_ksp->ks_update = brt_kstats_update; kstat_install(brt_ksp); } } static void brt_stat_fini(void) { if (brt_ksp != NULL) { kstat_delete(brt_ksp); brt_ksp = NULL; } 
wmsum_fini(&brt_sums.brt_addref_entry_in_memory); wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); wmsum_fini(&brt_sums.brt_decref_free_data_now); wmsum_fini(&brt_sums.brt_decref_no_entry); } void brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } void brt_fini(void) { brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); kmem_cache_destroy(brt_pending_entry_cache); } static brt_entry_t * brt_entry_alloc(const brt_entry_t *bre_init) { brt_entry_t *bre; bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); bre->bre_offset = bre_init->bre_offset; bre->bre_refcount = bre_init->bre_refcount; return (bre); } static void brt_entry_free(brt_entry_t *bre) { kmem_cache_free(brt_entry_cache, bre); } static void brt_entry_addref(brt_t *brt, const blkptr_t *bp) { brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); brt_entry_fill(bp, &bre_search, &vdevid); brt_wlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd == NULL) { ASSERT3U(vdevid, >=, brt->brt_nvdevs); /* New VDEV was added. */ brt_vdevs_expand(brt, vdevid + 1); brtvd = brt_vdev(brt, vdevid); } ASSERT(brtvd != NULL); if (!brtvd->bv_initiated) brt_vdev_realloc(brt, brtvd); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_addref_entry_in_memory); } else { /* * brt_entry_lookup() may drop the BRT (read) lock and * reacquire it (write). */ error = brt_entry_lookup(brt, brtvd, &bre_search); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); if (error == 0) BRTSTAT_BUMP(brt_addref_entry_on_disk); else BRTSTAT_BUMP(brt_addref_entry_not_on_disk); /* * When the BRT lock was dropped, brt_vdevs[] may have been * expanded and reallocated, we need to update brtvd's pointer. */ brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre == NULL) { bre = brt_entry_alloc(&bre_search); ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); brt->brt_nentries++; } else { /* * The entry was added when the BRT lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_addref_entry_read_lost_race); bre = racebre; } } bre->bre_refcount++; brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); brt_unlock(brt); } /* Return TRUE if block should be freed immediately. 
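 * B_FALSE means that some clone still references the data, so only the
 * reference count is dropped.  A hypothetical caller (illustrative
 * sketch; txg is whatever transaction group the free belongs to):
 *
 *	if (brt_entry_decref(spa, bp))
 *		zio_free(spa, txg, bp);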
*/ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_wlock(brt); brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); goto out; } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } /* * brt_entry_lookup() may drop the BRT lock and reacquire it. */ error = brt_entry_lookup(brt, brtvd, &bre_search); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); /* * When the BRT lock was dropped, brt_vdevs[] may have been expanded * and reallocated, we need to update brtvd's pointer. */ brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_entry_not_on_disk); bre = NULL; goto out; } racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { /* * The entry was added when the BRT lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); bre = brt_entry_alloc(&bre_search); ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); brt->brt_nentries++; out: if (bre == NULL) { /* * This is a free of a regular (not cloned) block. */ brt_unlock(brt); BRTSTAT_BUMP(brt_decref_no_entry); return (B_TRUE); } if (bre->bre_refcount == 0) { brt_unlock(brt); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } ASSERT(bre->bre_refcount > 0); bre->bre_refcount--; if (bre->bre_refcount == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); brt_unlock(brt); return (B_FALSE); } uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { error = brt_entry_lookup(brt, brtvd, &bre_search); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) refcnt = 0; else refcnt = bre_search.bre_refcount; } else refcnt = bre->bre_refcount; brt_unlock(brt); return (refcnt); } static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { brt_entry_t bre; uint64_t vdevid; ASSERT(bp != NULL); if (!zfs_brt_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); brt_entry_prefetch(brt, vdevid, &bre); } static int brt_pending_entry_compare(const void *x1, const void *x2) { const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); if (cmp == 0) { cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), DVA_GET_OFFSET(&bp2->blk_dva[0])); } } return (cmp); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_t *brt; avl_tree_t *pending_tree; kmutex_t *pending_lock; brt_pending_entry_t *bpe, *newbpe; avl_index_t where; uint64_t txg; brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 
pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); newbpe->bpe_bp = *bp; newbpe->bpe_count = 1; mutex_enter(pending_lock); bpe = avl_find(pending_tree, newbpe, &where); if (bpe == NULL) { avl_insert(pending_tree, newbpe, where); newbpe = NULL; } else { bpe->bpe_count++; } mutex_exit(pending_lock); if (newbpe != NULL) { ASSERT(bpe != NULL); ASSERT(bpe != newbpe); kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); } /* Prefetch BRT entry, as we will need it in the syncing context. */ brt_prefetch(brt, bp); } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_t *brt; avl_tree_t *pending_tree; kmutex_t *pending_lock; brt_pending_entry_t *bpe, bpe_search; uint64_t txg; brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; bpe_search.bpe_bp = *bp; mutex_enter(pending_lock); bpe = avl_find(pending_tree, &bpe_search, NULL); /* I believe we should always find bpe when this function is called. */ if (bpe != NULL) { ASSERT(bpe->bpe_count > 0); bpe->bpe_count--; if (bpe->bpe_count == 0) { avl_remove(pending_tree, bpe); kmem_cache_free(brt_pending_entry_cache, bpe); } } mutex_exit(pending_lock); } void brt_pending_apply(spa_t *spa, uint64_t txg) { brt_t *brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); brt = spa->spa_brt; pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; mutex_exit(pending_lock); for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it * already exists in the DEDUP table, so we can just * use that instead of creating new entry in * the BRT table. */ if (BP_GET_DEDUP(&bpe->bpe_bp)) { added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); } else { added_to_ddt = B_FALSE; } if (!added_to_ddt) brt_entry_addref(brt, &bpe->bpe_bp); } kmem_cache_free(brt_pending_entry_cache, bpe); mutex_enter(pending_lock); } mutex_exit(pending_lock); } static void brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); if (bre->bre_refcount == 0) { int error; error = brt_entry_remove(brt, brtvd, bre, tx); ASSERT(error == 0 || error == ENOENT); /* * If error == ENOENT then zfs_clone_range() was done from a * removed (but opened) file (open(), unlink()). 
*/ ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); } else { VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); } } static void brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; uint64_t vdevid; void *c; brt_wlock(brt); for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; if (!brtvd->bv_initiated) continue; if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); continue; } ASSERT(!brtvd->bv_entcount_dirty || avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { brt_sync_entry(brt, brtvd, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) brt_vdev_destroy(brt, brtvd, tx); } ASSERT0(brt->brt_nentries); brt_unlock(brt); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; brt_t *brt; ASSERT(spa_syncing_txg(spa) == txg); brt = spa->spa_brt; brt_rlock(brt); if (brt->brt_nentries == 0) { /* No changes. */ brt_unlock(brt); return; } brt_unlock(brt); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); brt_sync_table(brt, tx); dmu_tx_commit(tx); } static void brt_table_alloc(brt_t *brt) { for (int i = 0; i < TXG_SIZE; i++) { avl_create(&brt->brt_pending_tree[i], brt_pending_entry_compare, sizeof (brt_pending_entry_t), offsetof(brt_pending_entry_t, bpe_node)); mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, NULL); } } static void brt_table_free(brt_t *brt) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); avl_destroy(&brt->brt_pending_tree[i]); mutex_destroy(&brt->brt_pending_lock[i]); } } static void brt_alloc(spa_t *spa) { brt_t *brt; ASSERT(spa->spa_brt == NULL); brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); brt->brt_spa = spa; brt->brt_rangesize = 0; brt->brt_nentries = 0; brt->brt_vdevs = NULL; brt->brt_nvdevs = 0; brt_table_alloc(brt); spa->spa_brt = brt; } void brt_create(spa_t *spa) { brt_alloc(spa); brt_vdevs_alloc(spa->spa_brt, B_FALSE); } int brt_load(spa_t *spa) { brt_alloc(spa); brt_vdevs_alloc(spa->spa_brt, B_TRUE); return (0); } void brt_unload(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return; brt_vdevs_free(brt); brt_table_free(brt); rw_destroy(&brt->brt_lock); kmem_free(brt, sizeof (*brt)); spa->spa_brt = NULL; } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, "Enable prefetching of BRT entries"); #ifdef ZFS_BRT_DEBUG ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); #endif /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index a63aac51f225..e0cdd9e3f33e 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1,2595 +1,2593 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ static uint_t zfs_per_txg_dirty_frees_percent = 30; /* * Enable/disable forcing txg sync when dirty checking for holes with lseek(). * By default this is enabled to ensure accurate hole reporting, it can result * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. * Disabling this option will result in holes never being reported in dirty * files which is always safe. */ static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. 
*/ #ifdef _ILP32 uint_t dmu_prefetch_max = 8 * 1024 * 1024; #else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { 
dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. 
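 * A typical consumer holds the bonus buffer, inspects or updates its
 * contents, and releases it again, e.g. (illustrative sketch):
 *
 *	dmu_buf_t *db;
 *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
 *		(use db->db_data / db->db_size here)
 *		dmu_buf_rele(db, FTAG);
 *	}
 *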
* Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
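 *
 * For example (illustrative), compare the two flavours of plain buffer
 * hold above; the per-dnode one skips the per-call object lookup:
 *
 *	dmu_buf_hold(os, object, offset, FTAG, &db, flags);
 *	dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, flags);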
*/ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; boolean_t missed = B_FALSE; ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; if ((flags & DMU_READ_NO_DECRYPT) != 0) dbuf_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (read) zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) zio_nowait(zio); return (SET_ERROR(EIO)); } /* * Initiate async demand data read. * We check the db_state after calling dbuf_read() because * (1) dbuf_read() may change the state to CACHED due to a * hit in the ARC, and (2) on a cache miss, a child will * have been added to "zio" but not yet completed, so the * state will not yet be CACHED. 
*/ if (read) { if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && offset + length < db->db.db_offset + db->db.db_size) { if (offset <= db->db.db_offset) dbuf_flags |= DB_RF_PARTIAL_FIRST; else dbuf_flags |= DB_RF_PARTIAL_MORE; } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; } dbp[i] = &db->db; } if (!read) zfs_racct_write(length, nblks); if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { /* wait for async read i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. If the range * it too long, prefetch the first dmu_prefetch_max bytes as requested, while * for the rest only a higher level, also fitting within dmu_prefetch_max. It * should primarily help random reads, since for long sequential reads there is * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. Dnode read by dnode_hold() * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; int64_t level2 = level; uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); return; } if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift != 0) { /* * The object has multiple blocks. Calculate the full range * of blocks [start, end2) and then split it into two parts, * so that the first [start, end) fits into dmu_prefetch_max. 
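 *
 * Worked example with hypothetical numbers: with 128K data and
 * indirect blocks and a dmu_prefetch_max of 8M, at most 64 level-0
 * blocks are prefetched directly as [start, end); any remainder of
 * the requested range is instead covered by level2 (level 1 or
 * higher) indirect-block prefetches over [start2, end2).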
*/ start = dbuf_whichblock(dn, level, offset); end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; uint8_t ibs = dn->dn_indblkshift; uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; start2 = end = MIN(end2, start + limit); /* * Find level2 where [start2, end2) fits into dmu_prefetch_max. */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; } while (end2 - start2 > limit); } else { /* There is only one block. Prefetch it or nothing. */ start = start2 = end2 = 0; end = start + (level == 0 && offset < dn->dn_datablksz); } for (uint64_t i = start; i < end; i++) dbuf_prefetch(dn, level, i, pri, 0); for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Issue prefetch I/Os for the given object's dnode. */ void dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) { if (object == 0 || object >= DN_MAX_OBJECT) return; dnode_t *dn = DMU_META_DNODE(os); rw_enter(&dn->dn_struct_rwlock, RW_READER); uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, 0, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. 
* Used below in dmu_free_long_range_impl() to enable abort when unmounting */ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #else (void) os; #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. 
*/ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); memset((char *)buf + newsz, 0, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. 
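 *
 * Illustrative call sequence for such a consumer (hypothetical
 * variables; dmu_tx_hold_write_by_dnode() is assumed to be the
 * matching tx-hold helper):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write_by_dnode(tx, dn, offset, size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		dmu_write_by_dnode(dn, offset, size, buf, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}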
*/ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset zfs_uio_offset(uio). 
*/ int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that zfs_uiomove won't * block. */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset zfs_uio_offset(uio). */ int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * A "lightweight" write is faster than a regular write (e.g. * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the * data can not be read or overwritten until the transaction's txg has been * synced. This makes it appropriate for workloads that are known to be * (temporarily) write-only, like "zfs receive". * * A single block is written, starting at the specified offset in bytes. If * the call is successful, it returns 0 and the provided abd has been * consumed (the caller should not free it). 
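 *
 * Illustrative call (hypothetical caller; if the call fails the abd is
 * still owned by the caller, which is assumed to free it):
 *
 *	error = dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx);
 *	if (error != 0)
 *		abd_free(abd);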
*/ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); if (dr == NULL) return (SET_ERROR(EIO)); dr->dt.dll.dr_abd = abd; dr->dt.dll.dr_props = *zp; dr->dt.dll.dr_flags = flags; return (0); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { zfs_racct_write(blksz, 1); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. 
*/ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; int error; error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH); if (error != 0) return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); /* * This transaction does not produce any dirty data or log blocks, so * it should not be throttled. All other cases wait for TXG sync, by * which time the log block we are writing will be obsolete, so we can * skip waiting and just return error here instead. */ if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. 
* * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. 
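The return-value contract documented above is what a ZIL get_data callback has to honor. The fragment below is a minimal sketch of that dispatch, not the real zfs_get_data(); example_issue_sync() is a hypothetical wrapper, and the "track its progress" handling for EALREADY is left to the caller.

static int
example_issue_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
	int error = dmu_sync(pio, txg, done, zgd);

	switch (error) {
	case 0:
		/* I/O issued; the blkptr is logged from the done callback. */
		return (0);
	case EEXIST:
	case ENOENT:
		/* Already synced or freed: nothing to log, not a failure. */
		return (0);
	case EALREADY:
		/* A sync write for this buffer is already in flight. */
		return (error);
	default:
		/* EIO or similar: caller should txg_wait_synced() and retry. */
		return (error);
	}
}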
* There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. 
*/ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. 
*/ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; switch (os->os_redundant_metadata) { case ZFS_REDUNDANT_METADATA_ALL: copies++; break; case ZFS_REDUNDANT_METADATA_MOST: if (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) copies++; break; case ZFS_REDUNDANT_METADATA_SOME: if (DMU_OT_IS_CRITICAL(type)) copies++; break; case ZFS_REDUNDANT_METADATA_NONE: break; } } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * Reports the location of data and holes in an object. In order to * accurately report holes all dirty data must be synced to disk. This * causes extremely poor performance when seeking for holes in a dirty file. * As a compromise, only provide hole data when the dnode is clean. When * a dnode is dirty report the dnode as having no holes by returning EBUSY * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled * then hole reporting has been requested. Dirty dnodes * must be synced to disk to accurately report holes. * * Provided a RL_READER rangelock spanning 0-UINT64_MAX is * held by the caller only a single restart will be required. * We tolerate callers which do not hold the rangelock by * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (restarted) return (SET_ERROR(EBUSY)); txg_wait_synced(dmu_objset_pool(os), 0); restarted = 1; goto restart; } err = SET_ERROR(EBUSY); } else { err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK | (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; blkptr_t *bp; int error, numbufs; error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp); if (error != 0) { if (error == ESRCH) { error = SET_ERROR(ENXIO); } return (error); } ASSERT3U(numbufs, <=, *nbpsp); for (int i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; mutex_enter(&db->db_mtx); if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); if (dr->dt.dl.dr_brtwrite) { /* * This is very special case where we clone a * block and in the same transaction group we * read its BP (most likely to clone the clone). */ bp = &dr->dt.dl.dr_overridden_by; } else { /* * The block was modified in the same * transaction group. */ mutex_exit(&db->db_mtx); error = SET_ERROR(EAGAIN); goto out; } } else { bp = db->db_blkptr; } mutex_exit(&db->db_mtx); if (bp == NULL) { /* * The block was created in this transaction group, * so it has no BP yet. */ error = SET_ERROR(EAGAIN); goto out; } /* * Make sure we clone only data blocks. 
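As a usage note, consumers do not fill a zio_prop_t by hand; they call dmu_write_policy() and then inspect or adjust the result, as dmu_sync() does above with WP_DMU_SYNC. A minimal sketch (the function name is illustrative, and the caller is assumed to hold the dnode):

static boolean_t
example_may_nopwrite(objset_t *os, dnode_t *dn, int level)
{
	zio_prop_t zp;

	/* WP_DMU_SYNC selects the dedup checksum but disables dedup itself. */
	dmu_write_policy(os, dn, level, WP_DMU_SYNC, &zp);
	return (zp.zp_nopwrite);
}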
*/ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { error = SET_ERROR(EINVAL); goto out; } bps[i] = *bp; } *nbpsp = numbufs; out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; int error = 0, i, numbufs; spa = os->os_spa; VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); /* * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); goto out; } } for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); dr = list_head(&db->db_dirty_records); VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; dl->dr_overridden_by.blk_phys_birth = 0; } else { dl->dr_overridden_by.blk_birth = dr->dr_txg; if (!BP_IS_EMBEDDED(bp)) { dl->dr_overridden_by.blk_phys_birth = BP_PHYSICAL_BIRTH(bp); } } mutex_exit(&db->db_mtx); /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. - * Also, when replaying ZIL we don't want to bump references - * in the BRT as it was already done during ZIL claim. */ - if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. 
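Taken together, dmu_read_l0_bps() and dmu_brt_clone() form the two halves of a clone: read the source block pointers, then graft them into the destination as override BPs. A minimal sketch of that flow, assuming matching block sizes, a transaction already assigned against the destination range, and an illustrative 16-entry limit:

static int
example_clone_blocks(objset_t *os, uint64_t srcobj, uint64_t dstobj,
    uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	blkptr_t bps[16];
	size_t nbps = 16;
	int error;

	error = dmu_read_l0_bps(os, srcobj, off, len, bps, &nbps);
	if (error != 0) {
		/* EAGAIN: the source block is dirty in this txg; retry later. */
		return (error);
	}
	return (dmu_brt_clone(os, dstobj, off, len, tx, bps, nbps));
}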
*/ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } void byteswap_uint8_array(void *vbuf, size_t size) { (void) vbuf, (void) size; } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, 
ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c index 09c7be853bf9..2e0af60f6db4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_replay.c +++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c @@ -1,1223 +1,1259 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * NB: FreeBSD expects to be able to do vnode locking in lookup and * hold the locks across all subsequent VOPs until vput is called. * This means that its zfs vnops routines can't do any internal locking. * In order to have the same contract as the Linux vnops there would * needed to be duplicate locked vnops. If the vnops were used more widely * in common code this would likely be preferable. However, currently * this is the only file where this is the case. */ /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) * which is indexed by the transaction type. */ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { memset(vap, 0, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; #if defined(__FreeBSD__) || defined(__APPLE__) vap->va_type = IFTOVT(mode); #endif vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } static int zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } static void zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { xoptattr_t *xoap = NULL; uint64_t *attrs; uint64_t *crtime; uint32_t *bitmap; void *scanstamp; int i; xvap->xva_vattr.va_mask |= ATTR_XVATTR; if ((xoap = xva_getxoptattr(xvap)) == NULL) { xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */ return; } ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); bitmap = &lrattr->lr_attr_bitmap; for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) xvap->xva_reqattrmap[i] = *bitmap; attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); crtime = attrs + 1; scanstamp = (caddr_t)(crtime + 2); if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); if (XVA_ISSET_REQ(xvap, XAT_READONLY)) xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) xoap->xoa_av_quarantined = ((*attrs & XAT0_AV_QUARANTINED) != 0); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. */ memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0); } static int zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) { uint64_t uid_idx; uint64_t gid_idx; int domcnt = 0; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); if (uid_idx) domcnt++; if (gid_idx > 0 && gid_idx != uid_idx) domcnt++; return (domcnt); } static void * zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, int domcnt) { int i; for (i = 0; i != domcnt; i++) { fuid_infop->z_domain_table[i] = start; start = (caddr_t)start + strlen(start) + 1; } return (start); } /* * Set the uid/gid in the fuid_info structure. 
*/ static void zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) { /* * If owner or group are log specific FUIDs then slurp up * domain information and build zfs_fuid_info_t */ if (IS_EPHEMERAL(uid)) fuid_infop->z_fuid_owner = uid; if (IS_EPHEMERAL(gid)) fuid_infop->z_fuid_group = gid; } /* * Load fuid domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) { int domcnt; zfs_fuid_info_t *fuid_infop; fuid_infop = zfs_fuid_info_alloc(); domcnt = zfs_replay_domain_cnt(uid, gid); if (domcnt == 0) return (fuid_infop); fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); zfs_replay_fuid_ugid(fuid_infop, uid, gid); fuid_infop->z_domain_cnt = domcnt; *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); return (fuid_infop); } /* * load zfs_fuid_t's and fuid_domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, uint64_t gid) { uint64_t *log_fuid = (uint64_t *)start; zfs_fuid_info_t *fuid_infop; int i; fuid_infop = zfs_fuid_info_alloc(); fuid_infop->z_domain_cnt = domcnt; fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); for (i = 0; i != idcnt; i++) { zfs_fuid_t *zfuid; zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); zfuid->z_logfuid = *log_fuid; zfuid->z_id = -1; zfuid->z_domidx = 0; list_insert_tail(&fuid_infop->z_fuids, zfuid); log_fuid++; } zfs_replay_fuid_ugid(fuid_infop, uid, gid); *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); return (fuid_infop); } static void zfs_replay_swap_attrs(lr_attr_t *lrattr) { /* swap the lr_attr structure */ byteswap_uint32_array(lrattr, sizeof (*lrattr)); /* swap the bitmap */ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t)); /* swap the attributes, create time + 64 bit word for attributes */ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); } /* * Replay file create with optional ACL, xvattr information as well * as option FUID information. 
*/ static int zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_create_t *lracl = arg2; char *name = NULL; /* location determined later */ lr_create_t *lr = (lr_create_t *)lracl; znode_t *dzp; znode_t *zp; xvattr_t xva; int vflg = 0; vsecattr_t vsec = { 0 }; lr_attr_t *lrattr; void *aclstart; void *fuidstart; size_t xvatlen = 0; uint64_t txtype; uint64_t objid; uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); zfs_replay_swap_attrs(lrattr); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); } aclstart = (caddr_t)(lracl + 1) + xvatlen; zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); /* swap fuids */ if (lracl->lr_fuidcnt) { byteswap_uint64_array((caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), lracl->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode size. The generic * zfs_create() has no concept of these attributes, so we smuggle * the values inside the vattr's otherwise unused va_ctime, * va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) goto bail; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); zfs_fallthrough; case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); xva.xva_vattr.va_mask |= ATTR_XVATTR; zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, &vsec, NULL); #endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); zfs_fallthrough; case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl 
+ 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, &vsec, NULL); #endif break; default: error = SET_ERROR(ENOTSUP); } bail: if (error == 0 && zp != NULL) { #ifdef __FreeBSD__ VOP_UNLOCK1(ZTOV(zp)); #endif zrele(zp); } zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_create_t *lr = arg2; char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; znode_t *zp = NULL; xvattr_t xva; int vflg = 0; size_t lrsize = sizeof (lr_create_t); lr_attr_t *lrattr; void *start; size_t xvatlen; uint64_t txtype; uint64_t objid; uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode slot count. The * generic zfs_create() has no concept of these attributes, so * we smuggle the values inside the vattr's otherwise unused * va_ctime, va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) goto out; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; /* * Symlinks don't have fuid info, and CIFS never creates * symlinks. * * The _ATTR versions will grab the fuid info in their subcases. 
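For orientation, the on-log layout that the TX_CREATE_ACL_ATTR parsing above walks through is roughly the following; this is reconstructed from the pointer arithmetic in this function and is illustrative, not a formal format definition.

/*
 * lr_acl_create_t              fixed header
 * lr_attr_t + bitmap + attrs   ZIL_XVAT_SIZE(lr_attr_masksize) bytes (ATTR only)
 * ACEs                         lr_acl_bytes, padded to ZIL_ACE_LENGTH()
 * log FUIDs                    lr_fuidcnt * sizeof (uint64_t)
 * FUID domain strings          NUL-terminated, lr_domcnt of them
 * name                         NUL-terminated create name
 */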
*/ if (txtype != TX_SYMLINK && txtype != TX_MKDIR_ATTR && txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); } switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; zfs_fallthrough; case TX_CREATE: if (name == NULL) name = (char *)start; #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, NULL, NULL); #endif break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; zfs_fallthrough; case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, NULL, NULL); #endif break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); break; case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; #if defined(__linux__) error = zfs_symlink(dzp, name, &xva.xva_vattr, link, &zp, kcred, vflg, zfs_init_idmap); #else error = zfs_symlink(dzp, name, &xva.xva_vattr, link, &zp, kcred, vflg, NULL); #endif break; default: error = SET_ERROR(ENOTSUP); } out: if (error == 0 && zp != NULL) { #ifdef __FreeBSD__ VOP_UNLOCK1(ZTOV(zp)); #endif zrele(zp); } zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_remove_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_remove_t */ znode_t *dzp; int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: error = zfs_remove(dzp, name, kcred, vflg); break; case TX_RMDIR: error = zfs_rmdir(dzp, name, NULL, kcred, vflg); break; default: error = SET_ERROR(ENOTSUP); } zrele(dzp); return (error); } static int zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_link_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_link_t */ znode_t *dzp, *zp; int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { zrele(dzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; error = zfs_link(dzp, zp, name, kcred, vflg); zrele(zp); zrele(dzp); return (error); } static int do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, char *tname, uint64_t rflags, vattr_t *wo_vap) { 
znode_t *sdzp, *tdzp; int error, vflg = 0; /* Only Linux currently supports RENAME_* flags. */ #ifdef __linux__ VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); /* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); #else VERIFY0(rflags); #endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { zrele(sdzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; #if defined(__linux__) error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, wo_vap, zfs_init_idmap); #else error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, wo_vap, NULL); #endif zrele(tdzp); zrele(sdzp); return (error); } static int zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); } static int zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) { #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, NULL)); #else return (SET_ERROR(ENOTSUP)); #endif } static int zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) { #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_whiteout_t *lr = arg2; int error; - /* sname and tname follow lr_rename_whiteout_t */ - char *sname = (char *)(lr + 1); - char *tname = sname + strlen(sname) + 1; /* For the whiteout file. */ xvattr_t xva; uint64_t objid; uint64_t dnodesize; + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); objid = LR_FOID_GET_OBJ(lr->lr_wfoid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); /* * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which * assigns the object's creation time, generation number, and dnode * slot count. The generic zfs_rename() has no concept of these * attributes, so we smuggle the values inside the vattr's otherwise * unused va_ctime, va_nblocks, and va_fsid fields. 
*/ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); xva.xva_vattr.va_nblocks = lr->lr_wgen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) return (error); + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, RENAME_WHITEOUT, &xva.xva_vattr)); #else return (SET_ERROR(ENOTSUP)); #endif } static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } offset = lr->lr_offset; length = lr->lr_length; eod = offset + length; /* end of data for this write */ /* * This may be a write from a dmu_sync() for a whole block, * and may extend beyond the current end of the file. * We can't just replay what was written for this TX_WRITE as * a future TX_WRITE2 may extend the eof and the data for that * write needs to be there. So we write the whole block and * reduce the eof. This needs to be done within the single dmu * transaction created within vn_rdwr -> zfs_write. So a possible * new end of file is passed through in zfsvfs->z_replay_eof */ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } if (zp->z_size < eod) zfsvfs->z_replay_eof = eod; } error = zfs_write_simple(zp, data, length, offset, NULL); zrele(zp); zfsvfs->z_replay_eof = 0; /* safety */ return (error); } /* * TX_WRITE2 are only generated when dmu_sync() returns EALREADY * meaning the pool block is already being synced. So now that we always write * out full blocks, all we have to do is expand the eof if * the file is grown. 
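A worked example of the whole-block rounding in zfs_replay_write() above, assuming a 128K block and an application write of 512 bytes at offset 132096 that was logged via dmu_sync(); the numbers are illustrative only.

	uint64_t blocksize = 131072;		/* BP_GET_LSIZE() of the block */
	uint64_t offset = 132096, length = 512;

	if (length < blocksize) {
		offset -= offset % blocksize;	/* 132096 - 1024 = 131072 */
		length = blocksize;		/* replay the whole 128K block */
	}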
*/ static int zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; znode_t *zp; int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); top: end = lr->lr_offset + lr->lr_length; if (end > zp->z_size) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); zp->z_size = end; dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zrele(zp); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); return (error); } (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); /* Ensure the replayed seq is updated */ (void) zil_replaying(zfsvfs->z_log, tx); dmu_tx_commit(tx); } zrele(zp); return (error); } static int zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_truncate_t *lr = arg2; znode_t *zp; flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET; fl.l_start = lr->lr_offset; fl.l_len = lr->lr_length; error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE, lr->lr_offset, kcred); zrele(zp); return (error); } static int zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_setattr_t *lr = arg2; znode_t *zp; xvattr_t xva; vattr_t *vap = &xva.xva_vattr; int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if ((lr->lr_mask & ATTR_XVATTR) && zfsvfs->z_version >= ZPL_VERSION_INITIAL) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); vap->va_size = lr->lr_size; ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); gethrestime(&vap->va_ctime); vap->va_mask |= ATTR_CTIME; /* * Fill in xvattr_t portions if necessary. 
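The assign/wait/retry dance in zfs_replay_write2() above is the canonical DMU transaction pattern. It is shown in isolation below as a sketch; example_assign_retry() is an illustrative name, and the changes made under the transaction are elided.

static int
example_assign_retry(objset_t *os, znode_t *zp)
{
	dmu_tx_t *tx;
	int error;

	for (;;) {
		tx = dmu_tx_create(os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error == 0)
			break;
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			continue;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... make the changes covered by the holds, e.g. update z_size ... */
	dmu_tx_commit(tx);
	return (0);
}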
*/ start = (lr_setattr_t *)(lr + 1); if (vap->va_mask & ATTR_XVATTR) { zfs_replay_xvattr((lr_attr_t *)start, &xva); start = (caddr_t)start + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); } else xva.xva_vattr.va_mask &= ~ATTR_XVATTR; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); #if defined(__linux__) error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap); #else error = zfs_setattr(zp, vap, 0, kcred, NULL); #endif zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; zrele(zp); return (error); } static int zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_setsaxattr_t *lr = arg2; znode_t *zp; nvlist_t *nvl; size_t sa_size; char *name; char *value; size_t size; int error = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, SPA_FEATURE_ZILSAXATTR)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) goto out; ASSERT(zp->z_xattr_cached); nvl = zp->z_xattr_cached; /* Get xattr name, value and size from log record */ size = lr->lr_size; name = (char *)(lr + 1); if (size == 0) { value = NULL; error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); } else { value = name + strlen(name) + 1; /* Limited to 32k to keep nvpair memory allocations small */ if (size > DXATTR_MAX_ENTRY_SIZE) { error = SET_ERROR(EFBIG); goto out; } /* Prevent the DXATTR SA from consuming the entire SA region */ error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error) goto out; if (sa_size > DXATTR_MAX_SA_SIZE) { error = SET_ERROR(EFBIG); goto out; } error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, size); } /* * Update the SA for additions, modifications, and removals. On * error drop the inconsistent cached version of the nvlist, it * will be reconstructed from the ARC when next accessed. */ if (error == 0) error = zfs_sa_set_xattr(zp, name, value, size); if (error) { nvlist_free(nvl); zp->z_xattr_cached = NULL; } out: rw_exit(&zp->z_xattr_lock); zrele(zp); return (error); } static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_v0_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; vsa.vsa_aclflags = 0; vsa.vsa_aclentp = ace; error = zfs_setsecattr(zp, &vsa, 0, kcred); zrele(zp); return (error); } /* * Replaying ACLs is complicated by FUID support. * The log record may contain some optional data * to be used for replaying FUID's. These pieces * are the actual FUIDs that were created initially. * The FUID table index may no longer be valid and * during zfs_create() a new index may be assigned. 
* Because of this the log will contain the original * domain+rid in order to create a new FUID. * * The individual ACEs may contain an ephemeral uid/gid which is no * longer valid and will need to be replaced with an actual FUID. * */ static int zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); if (lr->lr_fuidcnt) { byteswap_uint64_array((caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes), lr->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; vsa.vsa_aclentsz = lr->lr_acl_bytes; vsa.vsa_aclflags = lr->lr_acl_flags; if (lr->lr_fuidcnt) { void *fuidstart = (caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, &fuidstart, lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); } error = zfs_setsecattr(zp, &vsa, 0, kcred); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; zrele(zp); return (error); } static int zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_clone_range_t *lr = arg2; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * Clones can be logged out of order, so don't be surprised if * the file is gone - just return success. 
*/ if (error == ENOENT) error = 0; return (error); } error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length, lr->lr_blksz, lr->lr_bps, lr->lr_nbps); zrele(zp); return (error); } /* * Callback vectors for replaying records */ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* no such type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ zfs_replay_create, /* TX_MKXATTR */ zfs_replay_create, /* TX_SYMLINK */ zfs_replay_remove, /* TX_REMOVE */ zfs_replay_remove, /* TX_RMDIR */ zfs_replay_link, /* TX_LINK */ zfs_replay_rename, /* TX_RENAME */ zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ zfs_replay_acl_v0, /* TX_ACL_V0 */ zfs_replay_acl, /* TX_ACL */ zfs_replay_create_acl, /* TX_CREATE_ACL */ zfs_replay_create, /* TX_CREATE_ATTR */ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ zfs_replay_setsaxattr, /* TX_SETSAXATTR */ zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ zfs_replay_clone_range, /* TX_CLONE_RANGE */ }; diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 635d17455981..eb012fe549dc 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1,1499 +1,1499 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); atomic_dec_32(&zp->z_sync_writes_cnt); zfs_exit(zfsvfs, FTAG); } return (error); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
*/ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { zfs_locked_range_t *lr; uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* File was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_holey_common(zp, cmd, off); zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (flag & V_ACE_MASK) #if defined(__linux__) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, zfs_init_idmap); #else error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, NULL); #endif else #if defined(__linux__) error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); #else error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); #endif zfs_exit(zfsvfs, FTAG); return (error); } static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { (void) cr; int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { zfs_exit(zfsvfs, FTAG); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. 
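/*
 * zfs_holey_common() above is what backs lseek(SEEK_HOLE/SEEK_DATA) on
 * ZFS: querying at or past EOF fails with ENXIO, and the implicit hole
 * at EOF is reported at the logical file size rather than at the end
 * of the last block.  A hedged user-space usage sketch follows (error
 * handling trimmed; _GNU_SOURCE is needed for SEEK_HOLE on Linux).
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s file\n", argv[0]);
                return (1);
        }
        int fd = open(argv[1], O_RDONLY);
        if (fd == -1)
                return (1);

        off_t data = lseek(fd, 0, SEEK_DATA);
        off_t hole = lseek(fd, 0, SEEK_HOLE);

        if (data == -1 && errno == ENXIO)
                printf("no data at or after offset 0 (empty file or "
                    "trailing hole only)\n");
        else
                printf("first data at or after 0: %lld\n", (long long)data);
        if (hole != -1)
                printf("first hole at or after 0: %lld (EOF counts)\n",
                    (long long)hole);

        (void) close(fd);
        return (0);
}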
We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); #if defined(__linux__) /* * if we actually read some bytes, bubbling EFAULT * up to become EAGAIN isn't what we want here... * * ...on Linux, at least. On FBSD, doing this breaks. */ if (error == EFAULT && (zfs_uio_offset(uio) - start_offset) != 0) error = 0; #endif break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) { zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); ASSERT(clear_setid_bits_txgp != NULL); ASSERT(tx != NULL); /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); mutex_exit(&zp->z_acl_lock); /* * Make sure SUID/SGID bits will be removed when we replay the * log. If the setid bits are keep coming back, don't log more * than one TX_SETATTR per transaction group. */ if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { vattr_t va = {0}; va.va_mask = ATTR_MODE; va.va_nodeid = zp->z_id; va.va_mode = newmode; zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, ATTR_MODE, NULL); *clear_setid_bits_txgp = dmu_tx_get_txg(tx); } } else { mutex_exit(&zp->z_acl_lock); } } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. 
* ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. */ ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); if (zfs_uio_prefaultpages(pfbytes, uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. 
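/*
 * The comment above describes the chunked write loop that follows:
 * each pass handles at most one manageable chunk in its own
 * transaction, and each chunk is trimmed so later passes stay aligned
 * (the same P2PHASE() trimming used in the read loop earlier).  Below
 * is a hedged, stand-alone sketch of that arithmetic; the 1 MiB chunk
 * size is illustrative, echoing zfs_vnops_read_chunk_size rather than
 * the block-size-based sizing the write loop actually uses.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_CHUNK      (1024 * 1024)   /* illustrative chunk size */

/*
 * Bytes to issue this pass: no more than remains, and never crossing
 * the next chunk boundary (offset % chunk == P2PHASE for powers of 2).
 */
static uint64_t
demo_chunk_bytes(uint64_t offset, uint64_t resid)
{
        uint64_t tochunk = DEMO_CHUNK - (offset % DEMO_CHUNK);

        return (resid < tochunk ? resid : tochunk);
}

int
main(void)
{
        uint64_t off = 300 * 1024;              /* unaligned start */
        uint64_t n = 3 * 1024 * 1024;           /* bytes left to write */

        while (n > 0) {
                uint64_t nbytes = demo_chunk_bytes(off, n);

                printf("chunk at %8llu for %7llu bytes\n",
                    (unsigned long long)off, (unsigned long long)nbytes);
                off += nbytes;
                n -= nbytes;
        }
        return (0);
}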
*/ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } uint64_t blksz; if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { if (zp->z_blksz > zfsvfs->z_max_blksz && !ISP2(zp->z_blksz)) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ blksz = 1 << highbit64(zp->z_blksz); } else { blksz = zfsvfs->z_max_blksz; } blksz = MIN(blksz, P2ROUNDUP(end_size, SPA_MINBLOCKSIZE)); blksz = MAX(blksz, zp->z_blksz); } else { blksz = zp->z_blksz; } arc_buf_t *abuf = NULL; ssize_t nbytes = n; if (n >= blksz && woff >= zp->z_size && P2PHASE(woff, blksz) == 0 && (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == blksz); if ((error = zfs_uiocopy(abuf->b_data, blksz, UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(nbytes, ==, blksz); } else { nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - P2PHASE(woff, blksz)); if (pfbytes < nbytes) { if (zfs_uio_prefaultpages(nbytes, uio)) { error = SET_ERROR(EFAULT); break; } pfbytes = nbytes; } } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * NB: We must call zfs_clear_setid_bits_if_necessary before * committing the transaction! */ /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ n -= tx_bytes - zfs_uio_resid(uio); pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif /* * On FreeBSD, EFAULT should be propagated back to the * VFS, which will handle faulting and will retry. */ if (error != 0 && error != EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. 
* * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { /* * XXX This might not be necessary if * dmu_assign_arcbuf_by_dbuf is guaranteed * to be atomic. */ zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); if (error1 != 0) /* Avoid clobbering EFAULT. */ error = error1; /* * NB: During replay, the TX_SETATTR record logged by * zfs_clear_setid_bits_if_necessary must precede any of * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; pfbytes -= nbytes; } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { zfs_exit(zfsvfs, FTAG); return (error); } if (commit) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. 
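/*
 * zfs_get_data(), which follows, services the two flavors of TX_WRITE
 * record its body describes: immediate writes carry their data inside
 * the log record, while indirect writes sync the data through the DMU
 * and log only a block pointer.  A hedged sketch of that trade-off is
 * below; the cutoff value is purely illustrative, and the real policy
 * is decided where the itx is built (zfs_log_write()), not here.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum {
        DEMO_WR_IMMEDIATE,      /* data copied into the ZIL record */
        DEMO_WR_INDIRECT        /* data synced; ZIL stores a pointer */
} demo_write_flavor_t;

#define DEMO_IMMEDIATE_MAX      (32 * 1024)     /* illustrative only */

static demo_write_flavor_t
demo_choose_flavor(uint64_t nbytes)
{
        /*
         * Small writes are cheap to store twice (log + final location);
         * large writes are cheaper to write once and reference from the
         * log, avoiding a second full copy of the data.
         */
        return (nbytes <= DEMO_IMMEDIATE_MAX ?
            DEMO_WR_IMMEDIATE : DEMO_WR_INDIRECT);
}

int
main(void)
{
        printf("4 KiB write -> %s\n", demo_choose_flavor(4096) ==
            DEMO_WR_IMMEDIATE ? "immediate" : "indirect");
        printf("1 MiB write -> %s\n", demo_choose_flavor(1 << 20) ==
            DEMO_WR_IMMEDIATE ? "immediate" : "indirect");
        return (0);
}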
*/ int zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } /* check if generation number matches */ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (zp_gen)) != 0) { zfs_zrele_async(zp); return (SET_ERROR(EIO)); } if (zp_gen != gen) { zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold_noread(os, object, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } static void zfs_get_done(zgd_t *zgd, int error) { (void) error; znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
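/*
 * The indirect-write path above must lock the whole block, but the
 * block size can change between sampling it and acquiring the range
 * lock, so the code re-checks after locking and retries on a mismatch.
 * A hedged, generic sketch of that lock / re-check / retry pattern
 * follows, using a pthread mutex and illustrative demo_* names.
 */
#include <pthread.h>
#include <stdint.h>

typedef struct {
        pthread_mutex_t lock;
        uint64_t        blksz;  /* may change while unlocked */
} demo_obj_t;

/*
 * Snapshot the block size, take the lock sized for that snapshot, then
 * verify the snapshot still holds; if not, drop the lock and retry.
 */
static uint64_t
demo_lock_whole_block(demo_obj_t *obj)
{
        for (;;) {
                uint64_t size = obj->blksz;     /* unlocked snapshot */

                pthread_mutex_lock(&obj->lock);
                if (obj->blksz == size)
                        return (size);          /* snapshot still valid */
                pthread_mutex_unlock(&obj->lock);       /* raced; retry */
        }
}

int
main(void)
{
        demo_obj_t obj = { PTHREAD_MUTEX_INITIALIZER, 4096 };
        uint64_t size = demo_lock_whole_block(&obj);

        pthread_mutex_unlock(&obj.lock);
        return (size == 4096 ? 0 : 1);
}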
*/ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } static int zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) { int error; /* Swap. Not sure if the order of zfs_enter()s is important. */ if (zfsvfs1 > zfsvfs2) { zfsvfs_t *tmpzfsvfs; tmpzfsvfs = zfsvfs2; zfsvfs2 = zfsvfs1; zfsvfs1 = tmpzfsvfs; } error = zfs_enter(zfsvfs1, tag); if (error != 0) return (error); if (zfsvfs1 != zfsvfs2) { error = zfs_enter(zfsvfs2, tag); if (error != 0) { zfs_exit(zfsvfs1, tag); return (error); } } return (0); } static void zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) { zfs_exit(zfsvfs1, tag); if (zfsvfs1 != zfsvfs2) zfs_exit(zfsvfs2, tag); } /* * We split each clone request in chunks that can fit into a single ZIL * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives * us room for storing 1022 block pointers. * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. * On errors which are caused by any file system limitations or * brt limitations `EINVAL` is returned. In the most cases a user * requested bad parameters, it could be possible to clone the file but * some parameters don't match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, uint64_t *outoffp, uint64_t *lenp, cred_t *cr) { zfsvfs_t *inzfsvfs, *outzfsvfs; objset_t *inos, *outos; zfs_locked_range_t *inlr, *outlr; dmu_buf_impl_t *db; dmu_tx_t *tx; zilog_t *zilog; uint64_t inoff, outoff, len, done; uint64_t outsize, size; int error; int count = 0; sa_bulk_attr_t bulk[3]; uint64_t mtime[2], ctime[2]; uint64_t uid, gid, projid; blkptr_t *bps; size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; inoff = *inoffp; outoff = *outoffp; len = *lenp; done = 0; inzfsvfs = ZTOZSB(inzp); outzfsvfs = ZTOZSB(outzp); /* * We need to call zfs_enter() potentially on two different datasets, * so we need a dedicated function for that. */ error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); if (error != 0) return (error); inos = inzfsvfs->z_os; outos = outzfsvfs->z_os; /* * Both source and destination have to belong to the same storage pool. */ if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * outos and inos belongs to the same storage pool. * see a few lines above, only one check. */ if (!spa_feature_is_enabled(dmu_objset_spa(outos), SPA_FEATURE_BLOCK_CLONING)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } ASSERT(!outzfsvfs->z_replay); /* * Block cloning from an unencrypted dataset into an encrypted * dataset and vice versa is not supported. */ if (inos->os_encrypted != outos->os_encrypted) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EXDEV)); } error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); if (error != 0) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (error); } /* * We don't copy source file's flags that's why we don't allow to clone * files that are in quarantine. 
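/*
 * zfs_enter_two() above avoids a classic ABBA deadlock by always
 * entering the two zfsvfs structures in a single global order (by
 * address); the same idea orders the source and destination range
 * locks a little further down.  A hedged, generic sketch of the
 * technique using pthread mutexes; names are illustrative.
 */
#include <pthread.h>
#include <stdint.h>

/*
 * Acquire two locks in ascending-address order so that two threads
 * locking the same pair can never deadlock; if both arguments are the
 * same lock, take it only once.
 */
static void
demo_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a > (uintptr_t)b) {
                pthread_mutex_t *tmp = a;

                a = b;
                b = tmp;
        }
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
}

int
main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        demo_lock_pair(&m2, &m1);       /* argument order does not matter */
        pthread_mutex_unlock(&m1);
        pthread_mutex_unlock(&m2);
        return (0);
}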
*/ if (inzp->z_pflags & ZFS_AV_QUARANTINED) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (inoff >= inzp->z_size) { *lenp = 0; zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (0); } if (len > inzp->z_size - inoff) { len = inzp->z_size - inoff; } if (len == 0) { *lenp = 0; zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (0); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(outzfsvfs)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * No overlapping if we are cloning within the same file. */ if (inzp == outzp) { if (inoff < outoff + len && outoff < inoff + len) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EINVAL)); } } /* * Maintain predictable lock order. */ if (inzp < outzp || (inzp == outzp && inoff < outoff)) { inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, RL_READER); outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, RL_WRITER); } else { outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, RL_WRITER); inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, RL_READER); } inblksz = inzp->z_blksz; /* * We cannot clone into files with different block size if we can't * grow it (block size is already bigger or more than one block). */ if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || outzp->z_size > inblksz)) { error = SET_ERROR(EINVAL); goto unlock; } /* * Block size must be power-of-2 if destination offset != 0. * There can be no multiple blocks of non-power-of-2 size. */ if (outoff != 0 && !ISP2(inblksz)) { error = SET_ERROR(EINVAL); goto unlock; } /* * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { error = SET_ERROR(EINVAL); goto unlock; } /* * Length must be multipe of blksz, except for the end of the file. */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { error = SET_ERROR(EINVAL); goto unlock; } /* * If we are copying only one block and it is smaller than recordsize * property, do not allow destination to grow beyond one block if it * is not there yet. Otherwise the destination will get stuck with * that block size forever, that can be as small as 512 bytes, no * matter how big the destination grow later. */ if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && outzp->z_size <= inblksz && outoff + len > inblksz) { error = SET_ERROR(EINVAL); goto unlock; } error = zn_rlimit_fsize(outoff + len); if (error != 0) { goto unlock; } if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { error = SET_ERROR(EFBIG); goto unlock; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, &outzp->z_size, 8); zilog = outzfsvfs->z_log; maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / sizeof (bps[0]); uid = KUID_TO_SUID(ZTOUID(outzp)); gid = KGID_TO_SGID(ZTOGID(outzp)); projid = outzp->z_projid; bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Clone the file in reasonable size chunks. 
Each chunk is cloned * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (len > 0) { size = MIN(inblksz * maxblocks, len); if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } nbps = maxblocks; error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created * in the current transaction group, error will be * EAGAIN here, which we can just return to the caller * so it can fallback if it likes. */ break; } /* * Encrypted data is fine as long as it comes from the same * dataset. * TODO: We want to extend it in the future to allow cloning to * datasets with the same keys, like clones or to be able to * clone a file from a snapshot of an encrypted dataset into the * dataset itself. */ if (BP_IS_PROTECTED(&bps[0])) { if (inzfsvfs != outzfsvfs) { error = SET_ERROR(EXDEV); break; } } /* * Start a transaction. */ tx = dmu_tx_create(outos); dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, outzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); break; } /* * Copy source znode's block size. This only happens on the * first iteration since zfs_rangelock_reduce() will shrink down * lr_len to the appropriate size. */ if (outlr->lr_length == UINT64_MAX) { zfs_grow_blocksize(outzp, inblksz, tx); /* * Round range lock up to the block boundary, so we * prevent appends until we are done. */ zfs_rangelock_reduce(outlr, outoff, ((len - 1) / inblksz + 1) * inblksz); } error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, - bps, nbps, B_FALSE); + bps, nbps); if (error != 0) { dmu_tx_commit(tx); break; } zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((outsize = outzp->z_size) < outoff + size) { (void) atomic_cas_64(&outzp->z_size, outsize, outoff + size); } error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, size, inblksz, bps, nbps); dmu_tx_commit(tx); if (error != 0) break; inoff += size; outoff += size; len -= size; done += size; } vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_znode_update_vfs(outzp); unlock: zfs_rangelock_exit(outlr); zfs_rangelock_exit(inlr); if (done > 0) { /* * If we have made at least partial progress, reset the error. */ error = 0; ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); if (outos->os_sync == ZFS_SYNC_ALWAYS) { zil_commit(zilog, outzp->z_id); } *inoffp += done; *outoffp += done; *lenp = done; } else { /* * If we made no progress, there must be a good reason. * EOF is handled explicitly above, before the loop. */ ASSERT3S(error, !=, 0); } zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (error); } /* * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), * but we cannot do that, because when replaying we don't have source znode * available. This is why we need a dedicated replay function. 
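/*
 * The loop above advances in chunks of MIN(inblksz * maxblocks, len),
 * where maxblocks is how many block pointers fit in one ZIL clone
 * record (the ~130816-byte / 1022-BP figures quoted in the earlier
 * comment).  A hedged arithmetic sketch using those quoted numbers;
 * the real values come from zil_max_log_data() and sizeof (blkptr_t).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_LOG_DATA       130816ULL       /* payload bytes quoted above */
#define DEMO_BP_SIZE            128ULL          /* on-disk block pointer size */

int
main(void)
{
        uint64_t maxblocks = DEMO_MAX_LOG_DATA / DEMO_BP_SIZE;  /* 1022 */
        uint64_t inblksz = 128ULL * 1024;       /* example source block size */
        uint64_t len = 512ULL * 1024 * 1024;    /* bytes requested */
        int chunks = 0;

        while (len > 0) {
                uint64_t size = inblksz * maxblocks;

                if (size > len)
                        size = len;
                chunks++;
                len -= size;
        }
        printf("%llu BPs per record, %d ZIL records for the request\n",
            (unsigned long long)maxblocks, chunks);
        return (0);
}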
*/ int zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) { zfsvfs_t *zfsvfs; dmu_buf_impl_t *db; dmu_tx_t *tx; int error; int count = 0; sa_bulk_attr_t bulk[3]; uint64_t mtime[2], ctime[2]; ASSERT3U(off, <, MAXOFFSET_T); ASSERT3U(len, >, 0); ASSERT3U(nbps, >, 0); zfsvfs = ZTOZSB(zp); ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), SPA_FEATURE_BLOCK_CLONING)); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); ASSERT(zfsvfs->z_replay); ASSERT(!zfs_is_readonly(zfsvfs)); if ((off % blksz) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } if (zp->z_blksz < blksz) zfs_grow_blocksize(zp, blksz, tx); - dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE); + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); if (zp->z_size < off + len) zp->z_size = off + len; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); /* * zil_replaying() not only check if we are replaying ZIL, but also * updates the ZIL header to record replay progress. */ VERIFY(zil_replaying(zfsvfs->z_log, tx)); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); zfs_exit(zfsvfs, FTAG); return (error); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); EXPORT_SYMBOL(zfs_clone_range); EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index b2699caa7589..7dfdaa081e8a 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4222 +1,4242 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. */ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. 
Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. 
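/*
 * zil_read_log_block() below validates the chain exactly as the
 * comment describes: the expected checksum seed for the next block is
 * this block's checksum with the sequence word incremented, and the
 * block is rejected if its recorded next-block checksum or used length
 * does not match.  A hedged sketch of the linkage check follows;
 * demo_cksum_t and the sequence-word index are simplified stand-ins
 * for zio_cksum_t and ZIL_ZC_SEQ.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_ZC_SEQ     3       /* index of the sequence word, illustrative */

typedef struct {
        uint64_t zc_word[4];    /* stand-in for zio_cksum_t */
} demo_cksum_t;

/*
 * The next block in the chain must carry this block's checksum with
 * the sequence word bumped by one; anything else ends the chain.
 */
static int
demo_chain_link_ok(const demo_cksum_t *this_blk,
    const demo_cksum_t *next_blk_recorded)
{
        demo_cksum_t expect = *this_blk;

        expect.zc_word[DEMO_ZC_SEQ]++;
        return (memcmp(&expect, next_blk_recorded, sizeof (expect)) == 0);
}

int
main(void)
{
        demo_cksum_t cur = { { 1, 2, 3, 7 } };
        demo_cksum_t next = { { 1, 2, 3, 8 } };

        return (demo_chain_link_ok(&cur, &next) ? 0 : 1);
}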
*/ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. */ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { 
zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
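/*
 * The zil_parse() loop that follows walks each block's records by
 * lrc_reclen; the assertions added in this change pin the invariant
 * that every record is at least sizeof (lr_t) and never extends past
 * the end of the block, and the walk stops once a record's sequence
 * number exceeds the claimed maximum.  A hedged, self-contained sketch
 * of that walk with a simplified record header (demo_lr_t stands in
 * for lr_t).
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
        uint64_t lrc_txtype;
        uint64_t lrc_reclen;
        uint64_t lrc_txg;
        uint64_t lrc_seq;
} demo_lr_t;

/*
 * Walk [lrp, end) record by record.  Returns -1 on a malformed record,
 * 0 otherwise; stops early once the claimed sequence range is exceeded.
 */
static int
demo_walk_records(const char *lrp, const char *end, uint64_t claim_lr_seq)
{
        while (lrp < end) {
                const demo_lr_t *lr = (const demo_lr_t *)lrp;
                uint64_t reclen = lr->lrc_reclen;

                if (reclen < sizeof (*lr) || reclen > (uint64_t)(end - lrp))
                        return (-1);    /* record does not fit the block */
                if (lr->lrc_seq > claim_lr_seq)
                        return (0);     /* past the claimed tail; stop */
                /* ... dispatch on lr->lrc_txtype here ... */
                lrp += reclen;
        }
        return (0);
}

int
main(void)
{
        demo_lr_t recs[2] = {
                { 1, sizeof (demo_lr_t), 10, 1 },
                { 1, sizeof (demo_lr_t), 10, 2 },
        };

        return (demo_walk_records((const char *)recs,
            (const char *)(recs + 2), UINT64_MAX));
}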
*/ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, end - lrp); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. 
*/ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If we previously claimed it, we need to free it. */ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. 
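/*
 * zil_alloc_lwb(), defined next, accounts for buffer space differently
 * under the two chain formats: with the slim chain the zil_chain_t
 * header sits at the front of the block and is counted as already-used
 * space, while the legacy format reserves room for the trailer off the
 * end instead; either way the usable payload is the block size minus
 * the chain structure.  A hedged sketch of that accounting follows;
 * DEMO_CHAIN_HDR is a stand-in, not the real sizeof (zil_chain_t).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_CHAIN_HDR  192ULL  /* stand-in chain-structure size */

typedef struct {
        uint64_t nmax;          /* bytes usable for records in this block */
        uint64_t nused;         /* bytes already consumed */
} demo_lwb_space_t;

static demo_lwb_space_t
demo_lwb_space_init(uint64_t blksz, int slim)
{
        demo_lwb_space_t s;

        if (slim) {
                s.nmax = blksz;                 /* header at the front... */
                s.nused = DEMO_CHAIN_HDR;       /* ...already counted used */
        } else {
                s.nmax = blksz - DEMO_CHAIN_HDR;        /* trailer at the end */
                s.nused = 0;
        }
        return (s);
}

int
main(void)
{
        demo_lwb_space_t slim = demo_lwb_space_init(128 * 1024, 1);
        demo_lwb_space_t old = demo_lwb_space_init(128 * 1024, 0);

        printf("slim:   max %llu, used %llu\n",
            (unsigned long long)slim.nmax, (unsigned long long)slim.nused);
        printf("legacy: max %llu, used %llu\n",
            (unsigned long long)old.nmax, (unsigned long long)old.nused);
        return (0);
}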
*/ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. 
* Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). 
We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. 
*/ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. 
* In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). 
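zil_commit_waiter_skip() above is the usual "mark done under the lock, then broadcast" handshake between the ZIL writer and a thread sleeping in zil_commit(). A minimal pthread sketch of the same pattern (the names mirror the waiter fields but this is not the kernel cv/mutex API):

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct {
        pthread_mutex_t zcw_lock;
        pthread_cond_t  zcw_cv;
        bool            zcw_done;
    } waiter_t;

    /* Producer side: equivalent of zil_commit_waiter_skip(). */
    static void
    waiter_skip(waiter_t *zcw)
    {
        pthread_mutex_lock(&zcw->zcw_lock);
        zcw->zcw_done = true;
        pthread_cond_broadcast(&zcw->zcw_cv);
        pthread_mutex_unlock(&zcw->zcw_lock);
    }

    /* Consumer side: what the committing thread does while it sleeps. */
    static void
    waiter_wait(waiter_t *zcw)
    {
        pthread_mutex_lock(&zcw->zcw_lock);
        while (!zcw->zcw_done)
            pthread_cond_wait(&zcw->zcw_cv, &zcw->zcw_lock);
        pthread_mutex_unlock(&zcw->zcw_lock);
    }

    static void *
    producer(void *arg)
    {
        waiter_skip(arg);
        return (NULL);
    }

    int
    main(void)
    {
        waiter_t zcw = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, false };
        pthread_t t;

        pthread_create(&t, NULL, producer, &zcw);
        waiter_wait(&zcw);
        pthread_join(t, NULL);
        return (0);
    }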
*/ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. 
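The zl_last_lwb_latency update in zil_lwb_flush_vdevs_done() above is a 1/8-weight exponential moving average kept in integer arithmetic, so a single slow lwb only pulls the estimate up by an eighth of its excess. The same smoothing, stand-alone:

    #include <stdint.h>
    #include <stdio.h>

    /* new_avg = old_avg * 7/8 + sample * 1/8, all in integer math. */
    static uint64_t
    ewma_update(uint64_t avg, uint64_t sample)
    {
        return ((avg * 7 + sample) / 8);
    }

    int
    main(void)
    {
        uint64_t lat = 0;
        uint64_t samples[] = { 800, 900, 850, 4000, 820 };

        for (unsigned i = 0; i < sizeof (samples) / sizeof (samples[0]); i++) {
            lat = ewma_update(lat, samples[i]);
            printf("sample %llu -> avg %llu\n",
                (unsigned long long)samples[i], (unsigned long long)lat);
        }
        return (0);
    }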
*/ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. 
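zil_lwb_flush_wait_all() above pairs a per-txg inflight counter with a condition variable: each completion decrements and broadcasts when the count reaches zero, and the syncing thread sleeps until it does. A pthread sketch of that gate, simplified to a single counter rather than the per-txg array (names are illustrative):

    #include <pthread.h>

    typedef struct {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        unsigned        inflight;
    } flush_gate_t;

    static void
    flush_issue(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        g->inflight++;
        pthread_mutex_unlock(&g->lock);
    }

    /* Completion path: mirrors the tail of zil_lwb_flush_vdevs_done(). */
    static void
    flush_done(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        if (--g->inflight == 0)
            pthread_cond_broadcast(&g->cv);
        pthread_mutex_unlock(&g->lock);
    }

    /* Waiter: mirrors zil_lwb_flush_wait_all(). */
    static void
    flush_wait_all(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        while (g->inflight > 0)
            pthread_cond_wait(&g->cv, &g->lock);
        pthread_mutex_unlock(&g->lock);
    }

    int
    main(void)
    {
        flush_gate_t g = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, 0 };

        flush_issue(&g);
        flush_issue(&g);
        flush_done(&g);
        flush_done(&g);
        flush_wait_all(&g);     /* returns immediately, count is zero */
        return (0);
    }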
*/ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; /* * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not * called for it yet, and when it will be, it won't be able to make * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ nlwb = list_next(&zilog->zl_lwb_list, lwb); if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created). 
*/ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); lwb->lwb_state = LWB_STATE_CLOSED; /* * If there was an allocation failure then returned NULL will trigger * zil_commit_writer_stall() at the caller. This is inherently racy, * since allocation may not have happened yet. 
*/ if (lwb->lwb_error != 0) return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guessing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, zil_blksz, uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; boolean_t slog; zbookmark_phys_t zb; zio_priority_t prio; int error; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); /* * The lwb is now ready to be issued, but it can be only if it already * got its block pointer allocated or the allocation has failed. * Otherwise leave it as-is, relying on some other thread to issue it * after allocating its block pointer via calling zil_lwb_write_issue() * for the previous lwb(s) in the chain. */ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. 
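The size guess in zil_lwb_write_close() above has two parts: round the current usage up into a small set of bucket sizes, then take the maximum over the last few guesses so an alternating 2k/64k workload does not flip-flop between block sizes. A stand-alone sketch of the same selection, assuming a history depth of 16 (the rotor mask above implies a power of two; the exact ZIL_PREV_BLKS value is an assumption here) and capping the table at 128 KiB for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define PREV_BLKS   16          /* assumed history depth, must be 2^n */

    static const struct {
        uint64_t limit;
        uint64_t blksz;
    } buckets[] = {
        { 4096,          4096 },
        { 8192 + 4096,   8192 + 4096 },
        { 32768 + 4096,  32768 + 4096 },
        { 65536 + 4096,  65536 + 4096 },
        { 131072,        131072 },
        { 131072 + 4096, 65536 + 4096 },
        { UINT64_MAX,    131072 },  /* illustrative cap */
    };

    static uint64_t prev[PREV_BLKS];
    static unsigned rotor;

    static uint64_t
    pick_block_size(uint64_t used)
    {
        uint64_t blksz;
        int i;

        for (i = 0; used > buckets[i].limit; i++)
            continue;
        blksz = buckets[i].blksz;

        /* Smooth over recent history to avoid the picket-fence effect. */
        prev[rotor] = blksz;
        for (i = 0; i < PREV_BLKS; i++)
            if (prev[i] > blksz)
                blksz = prev[i];
        rotor = (rotor + 1) & (PREV_BLKS - 1);
        return (blksz);
    }

    int
    main(void)
    {
        /* Alternating small/large bursts settle on the larger size. */
        uint64_t sizes[] = { 2048, 65536, 2048, 65536, 2048 };

        for (unsigned i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
            printf("%llu -> %llu\n", (unsigned long long)sizes[i],
                (unsigned long long)pick_block_size(sizes[i]));
        return (0);
    }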
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. */ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). 
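Earlier in zil_lwb_write_issue() above, the Slim-ZIL path rounds lwb_nused up with P2ROUNDUP_TYPED() to a multiple of ZIL_MIN_BLKSZ before shrinking the write zio. The rounding itself is the usual power-of-two trick; a minimal version, assuming the alignment is a power of two (as ZIL_MIN_BLKSZ is):

    #include <assert.h>
    #include <stdint.h>

    /* Round x up to the next multiple of align; align must be a power of two. */
    static uint64_t
    p2roundup(uint64_t x, uint64_t align)
    {
        return ((x + align - 1) & ~(align - 1));
    }

    int
    main(void)
    {
        assert(p2roundup(0, 4096) == 0);
        assert(p2roundup(1, 4096) == 4096);
        assert(p2roundup(4096, 4096) == 4096);
        assert(p2roundup(5000, 4096) == 8192);
        return (0);
    }

The same helper shape is what keeps log records 8-byte aligned elsewhere in this file, just with sizeof (uint64_t) as the alignment.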
*/ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a * single log block. Below that it is a tradeoff of additional memory copy * and possibly worse log space efficiency vs additional range lock/unlock. */ static uint_t zil_maxcopied = 7680; uint64_t zil_max_copied_data(zilog_t *zilog) { uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); return (MIN(max_data, zil_maxcopied)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } + reclen = lr->lrc_reclen; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(reclen, ==, sizeof (lr_write_t)); dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { + ASSERT3U(reclen, >=, sizeof (lr_t)); dlen = 0; } - reclen = lr->lrc_reclen; + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + /* + * There must be enough space in the log block to hold reclen. 
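The branch test in zil_lwb_assign() above decides when to close the current block: always when even the record header will not fit, and also when the whole record will not fit but the leftover space is below the allowed "waste" (1/16 of a block's usable space, per zil_max_waste_space() above), so abandoning it costs little. A simplified stand-alone version of that predicate; the capacity constants below are assumptions, only the shape of the condition mirrors the code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_LOG_DATA    (131072 - 192)  /* assumed usable bytes per block */
    #define MAX_WASTE       (MAX_LOG_DATA / 16)

    /*
     * Return true if a record of reclen header bytes plus dlen data bytes
     * should go into a fresh block instead of the current one, which has
     * lwb_sp bytes left.
     */
    static bool
    needs_new_block(uint64_t reclen, uint64_t dlen, uint64_t lwb_sp)
    {
        if (reclen > lwb_sp)
            return (true);
        return (reclen + dlen > lwb_sp &&
            lwb_sp < MAX_WASTE &&
            (dlen % MAX_LOG_DATA == 0 ||
            lwb_sp < reclen + dlen % MAX_LOG_DATA));
    }

    int
    main(void)
    {
        /* Small record, plenty of room: keep filling the current block. */
        printf("%d\n", needs_new_block(192, 4096, 65536));
        /* Big record, tiny tail left: cheaper to start a new block. */
        printf("%d\n", needs_new_block(192, 65536, 2048));
        return (0);
    }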
+ * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_null(NULL, zilog->zl_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. 
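When the data still does not fit, the loop above clones the itx and carves off dnow = MIN(dlen, space - reclen) bytes per block, advancing lr_offset and shrinking lr_length on the remainder. The chunking arithmetic in isolation, with a fixed per-block free space for simplicity (the real loop recomputes lwb_sp for each fresh block, and 192 is only an assumed write-record header size):

    #include <stdint.h>
    #include <stdio.h>

    #define RECLEN  192     /* assumed write-record header size */

    /* Print how a dlen-byte write is split across blocks with 'space' free. */
    static void
    split_write(uint64_t offset, uint64_t dlen, uint64_t space_per_block)
    {
        while (dlen > 0) {
            uint64_t dnow = dlen;

            if (dnow > space_per_block - RECLEN)
                dnow = space_per_block - RECLEN;
            printf("chunk: offset %llu length %llu\n",
                (unsigned long long)offset, (unsigned long long)dnow);
            offset += dnow;
            dlen -= dnow;
        }
    }

    int
    main(void)
    {
        /* A 300 KiB write into blocks with ~128 KiB of free space each. */
        split_write(0, 300 * 1024, 131072);
        return (0);
    }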
*/ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. */ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. 
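zil_itx_create() and zil_itx_destroy() above maintain the invariant that itx_size is exactly the itx header plus the 8-byte-rounded record length, and the new assertions check that relation in both directions. A userland sketch of the same sizing with a deliberately simplified itx layout (all names and the record contents here are illustrative):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    typedef struct {
        uint64_t lrc_reclen;
        uint64_t lrc_txtype;
    } fake_lr_t;

    typedef struct {
        size_t    itx_size;
        fake_lr_t itx_lr;           /* record starts here, may run longer */
    } fake_itx_t;

    static fake_itx_t *
    fake_itx_create(uint64_t txtype, size_t olrsize)
    {
        /* Round the record up to 8 bytes, as zil_itx_create() does. */
        size_t lrsize = (olrsize + 7) & ~(size_t)7;
        size_t itxsize = offsetof(fake_itx_t, itx_lr) + lrsize;
        fake_itx_t *itx = calloc(1, itxsize);   /* padding starts zeroed */

        itx->itx_size = itxsize;
        itx->itx_lr.lrc_reclen = lrsize;
        itx->itx_lr.lrc_txtype = txtype;
        return (itx);
    }

    static void
    fake_itx_destroy(fake_itx_t *itx)
    {
        /* The invariant the new ASSERTs above encode. */
        assert(itx->itx_lr.lrc_reclen ==
            itx->itx_size - offsetof(fake_itx_t, itx_lr));
        free(itx);
    }

    int
    main(void)
    {
        fake_itx_t *itx = fake_itx_create(1, 100);

        assert(itx->itx_lr.lrc_reclen == 104);  /* 100 rounded up to 8 */
        fake_itx_destroy(itx);
        return (0);
    }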
This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. 
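The ian_search change in zil_remove_async() above fixes the key passed to avl_find(): the comparator dereferences its arguments as itx_async_node_t, so the lookup must hand it a node-typed search key on the stack rather than a pointer to the bare oid. A small sketch of the same pattern using bsearch(), with padding in front of the key field to show why an untyped pointer would not line up (layout is illustrative, not the real itx_async_node_t):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        char     ia_pad[16];    /* fields before the key; &oid would not line up */
        uint64_t ia_foid;
    } node_t;

    /* Comparator in the style of zil_aitx_compare(): casts to the node type. */
    static int
    node_cmp(const void *x1, const void *x2)
    {
        uint64_t o1 = ((const node_t *)x1)->ia_foid;
        uint64_t o2 = ((const node_t *)x2)->ia_foid;

        return ((o1 > o2) - (o1 < o2));
    }

    int
    main(void)
    {
        node_t nodes[3] = {
            { .ia_foid = 10 }, { .ia_foid = 20 }, { .ia_foid = 30 }
        };
        node_t search = { .ia_foid = 20 };      /* typed search key on stack */
        node_t *found;

        found = bsearch(&search, nodes, 3, sizeof (node_t), node_cmp);
        printf("found foid %llu\n",
            found ? (unsigned long long)found->ia_foid : 0ULL);
        return (0);
    }

The (o1 > o2) - (o1 < o2) expression is the portable three-way compare that TREE_CMP provides in the kernel code.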
*/ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. 
*/ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. 
*/ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || zilog->zl_cur_used == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; zilog->zl_cur_used = 0; } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * If the lwb is still opened, it means the workload is really * multi-threaded and we won the chance of write aggregation. * If it is not opened yet, but previous lwb is still not * flushed, it still means the workload is multi-threaded, but * there was too much time between the commits to aggregate, so * we try aggregation next times, but without too much hopes. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED) { zilog->zl_parallel = ZIL_BURSTS; } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { zilog->zl_parallel = MAX(zilog->zl_parallel, ZIL_BURSTS / 2); } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done, leave the rest of * itx list to somebody else who care. */ zilog->zl_parallel = ZIL_BURSTS; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. 
*/ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. */ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. 
If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. 
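 *
 * (For reference, and as an informal sketch only: the lwb states
 * referenced here progress roughly as NEW -> OPENED -> CLOSED ->
 * READY -> ISSUED -> WRITE_DONE -> FLUSH_DONE; the comment above the
 * lwb_state_t definition, referenced below, is authoritative.)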
* * The important thing, is we treat the lwb differently depending on * if it's OPENED or CLOSED, and block any other threads that might * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * callback of zio issued by zil_lwb_write_issue() try to get it, * while zil_lwb_write_issue() is blocked on attempt to issue next * lwb it found in LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); zil_burst_done(zilog); if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. 
When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. */ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. 
* * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. 
* * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). 
Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. 
*/ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof 
(lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. 
Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). 
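 *
 * (Aside: from a caller's perspective, the contract described at the
 * top of this function works out to roughly the following sketch,
 * where "osname", "cookie" and "error" are the caller's own:
 *
 *   void *cookie;
 *   error = zil_suspend(osname, &cookie);
 *   if (error == 0) {
 *           ... operate while the ZIL is suspended ...
 *           zil_resume(cookie);
 *   }
 *
 * or pass cookiep == NULL for a one-shot suspend-and-resume.)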
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. 
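 *
 * (Note, while reading this function: zr_lr is sized at
 * 2 * SPA_MAXBLOCKSIZE in zil_replay(), which leaves room for both
 * the copied record at the front of the buffer and, for TX_WRITE
 * records, the up-to-one-block of data that zil_read_log_data()
 * appends at zr->zr_lr + reclen.)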
*/ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index 9de515e8767a..e511b31fee6d 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ 
b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -1,573 +1,577 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include #include #include #include #include #include #include #include /* * Checksum vectors. * * In the SPA, everything is checksummed. We support checksum vectors * for three distinct reasons: * * 1. Different kinds of data need different levels of protection. * For SPA metadata, we always want a very strong checksum. * For user data, we let users make the trade-off between speed * and checksum strength. * * 2. Cryptographic hash and MAC algorithms are an area of active research. * It is likely that in future hash functions will be at least as strong * as current best-of-breed, and may be substantially faster as well. * We want the ability to take advantage of these new hashes as soon as * they become available. * * 3. If someone develops hardware that can compute a strong hash quickly, * we want the ability to take advantage of that hardware. * * Of course, we don't want a checksum upgrade to invalidate existing * data, so we store the checksum *function* in eight bits of the bp. * This gives us room for up to 256 different checksum functions. * * When writing a block, we always checksum it with the latest-and-greatest * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we * compute via the checksum function specified by BP_GET_CHECKSUM(bp). * * SALTED CHECKSUMS * * To enable the use of less secure hash algorithms with dedup, we * introduce the notion of salted checksums (MACs, really). A salted * checksum is fed both a random 256-bit value (the salt) and the data * to be checksummed. This salt is kept secret (stored on the pool, but * never shown to the user). Thus even if an attacker knew of collision * weaknesses in the hash algorithm, they won't be able to mount a known * plaintext attack on the DDT, since the actual hash value cannot be * known ahead of time. How the salt is used is algorithm-specific * (some might simply prefix it to the data block, others might need to * utilize a full-blown HMAC). On disk the salt is stored in a ZAP * object in the MOS (DMU_POOL_CHECKSUM_SALT). * * CONTEXT TEMPLATES * * Some hashing algorithms need to perform a substantial amount of * initialization work (e.g. salted checksums above may need to pre-hash * the salt) before being able to process data. 
Performing this * redundant work for each block would be wasteful, so we instead allow * a checksum algorithm to do the work once (the first time it's used) * and then keep this pre-initialized context as a template inside the * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to * construct and destruct the pre-initialized checksum context. The * pre-initialized context is then reused during each checksum * invocation and passed to the checksum function. */ static void abd_checksum_off(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) abd, (void) size, (void) ctx_template; ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } static void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native, zcp); } static void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap, zcp); } static inline void abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp) { fletcher_4_abd_ops.acf_init(acdp); abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp); fletcher_4_abd_ops.acf_fini(acdp); } void abd_fletcher_4_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { .acd_byteorder = ZIO_CHECKSUM_NATIVE, .acd_zcp = zcp, .acd_ctx = &ctx }; abd_fletcher_4_impl(abd, size, &acd); } void abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { .acd_byteorder = ZIO_CHECKSUM_BYTESWAP, .acd_zcp = zcp, .acd_ctx = &ctx }; abd_fletcher_4_impl(abd, size, &acd); } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, 
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* * The flag corresponding to the "verify" in dedup=[checksum,]verify * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. */ spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum) { VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { case ZIO_CHECKSUM_BLAKE3: return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: return (SPA_FEATURE_SKEIN); case ZIO_CHECKSUM_EDONR: return (SPA_FEATURE_EDONR); default: return (SPA_FEATURE_NONE); } } enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (ZIO_CHECKSUM_ON_VALUE); return (child); } enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent) { ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (spa_dedup_checksum(spa)); if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & ZCHECKSUM_FLAG_DEDUP) || (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); return (child); } /* * Set the external verifier for a gang block based on , * a tuple which is guaranteed to be unique for the life of the pool. */ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); } /* * Set the external verifier for a label block based on its offset. * The vdev is implicit, and the txg is unknowable at pool open time -- * hence the logic in vdev_uberblock_load() to find the most recent copy. */ static void zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) { ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); } /* * Calls the template init function of a checksum which supports context * templates and installs the template into the spa_t. */ static void zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; if (ci->ci_tmpl_init == NULL) return; if (spa->spa_cksum_tmpls[checksum] != NULL) return; VERIFY(ci->ci_tmpl_free != NULL); mutex_enter(&spa->spa_cksum_tmpls_lock); if (spa->spa_cksum_tmpls[checksum] == NULL) { spa->spa_cksum_tmpls[checksum] = ci->ci_tmpl_init(&spa->spa_cksum_salt); VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); } mutex_exit(&spa->spa_cksum_tmpls_lock); } /* convenience function to update a checksum to accommodate an encryption MAC */ static void zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor) { /* * Weak checksums do not have their entropy spread evenly * across the bits of the checksum. 
Therefore, when truncating * a weak checksum we XOR the first 2 words with the last 2 so * that we don't "lose" any entropy unnecessarily. */ if (xor) { cksum->zc_word[0] ^= cksum->zc_word[2]; cksum->zc_word[1] ^= cksum->zc_word[3]; } cksum->zc_word[2] = saved->zc_word[2]; cksum->zc_word[3] = saved->zc_word[3]; } /* * Generate the checksum. */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_t *abd, uint64_t size) { static const uint64_t zec_magic = ZEC_MAGIC; blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t cksum, saved; spa_t *spa = zio->io_spa; boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t eck; size_t eck_offset; memset(&saved, 0, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) { zio_checksum_gang_verifier(&eck.zec_cksum, bp); } else if (checksum == ZIO_CHECKSUM_LABEL) { zio_checksum_label_verifier(&eck.zec_cksum, offset); } else { saved = eck.zec_cksum; eck.zec_cksum = bp->blk_cksum; } abd_copy_from_buf_off(abd, &zec_magic, eck_offset + offsetof(zio_eck_t, zec_magic), sizeof (zec_magic)); abd_copy_from_buf_off(abd, &eck.zec_cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); if (bp != NULL && BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); abd_copy_from_buf_off(abd, &cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); } else { saved = bp->blk_cksum; ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); bp->blk_cksum = cksum; } } int zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; zio_eck_t eck; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); zio_checksum_template_init(checksum, spa); IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; size_t eck_offset; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; uint64_t nused; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck) + offsetof(zio_eck_t, zec_cksum); if (eck.zec_magic == ZEC_MAGIC) { nused = zilc.zc_nused; } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc.zc_nused); } else { return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = 
P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); eck_offset += offsetof(zio_eck_t, zec_cksum); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, offset); else verifier = bp->blk_cksum; byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck.zec_cksum; abd_copy_from_buf_off(abd, &verifier, eck_offset, sizeof (zio_cksum_t)); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } /* * MAC checksums are a special case since half of this checksum will * actually be the encryption MAC. This will be verified by the * decryption process, so we just check the truncated checksum now. * Objset blocks use embedded MACs so we don't truncate the checksum * for them. */ if (bp != NULL && BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) { if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) { actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2]; actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3]; } actual_cksum.zc_word[2] = 0; actual_cksum.zc_word[3] = 0; expected_cksum.zc_word[2] = 0; expected_cksum.zc_word[3] = 0; } if (info != NULL) { info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; info->zbc_has_cksum = 1; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); return (0); } int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) { blkptr_t *bp = zio->io_bp; uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); if (error != 0) info->zbc_injected = 1; } return (error); } /* * Called by a spa_t that's about to be deallocated. This steps through * all of the checksum context templates and deallocates any that were * initialized using the algorithm-specific template init function. 
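 *
 * As an informal sketch of the template lifecycle (the functions
 * themselves are authoritative):
 *
 *   zio_checksum_template_init(checksum, spa)
 *       lazily calls ci_tmpl_init(&spa->spa_cksum_salt) once, under
 *       spa_cksum_tmpls_lock, and caches the result in
 *       spa->spa_cksum_tmpls[checksum];
 *
 *   ci_func[native or byteswap](abd, size,
 *       spa->spa_cksum_tmpls[checksum], &cksum)
 *       every subsequent checksum computation reuses the cached
 *       template;
 *
 *   zio_checksum_templates_free(spa)
 *       calls ci_tmpl_free() on each non-NULL entry at spa teardown.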
*/ void zio_checksum_templates_free(spa_t *spa) { for (enum zio_checksum checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { if (spa->spa_cksum_tmpls[checksum] != NULL) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; VERIFY(ci->ci_tmpl_free != NULL); ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); spa->spa_cksum_tmpls[checksum] = NULL; } } } diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index ac373379e43f..ce5b75462349 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1,1748 +1,1756 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. 
Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; uint64_t value; } zvol_task_t; uint64_t zvol_name_hash(const char *name) { uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. 
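 * (Here that means ZFS_PROP_VOLSIZE and ZFS_PROP_VOLBLOCKSIZE, both already
 * consumed above: volblocksize sizes the DMU object claim and volsize is
 * written to the "size" entry of ZVOL_ZAP_OBJ.)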
*/ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* * Update volthreading. 
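 * Hypothetical usage sketch (the dataset name is an example only):
 *
 *	error = zvol_set_volthreading("tank/vol1", B_TRUE);
 *	(error is ENOENT when no such zvol is currently registered)
 *
 * Only zv_threading is updated, under zv_state_lock; zv_suspend_lock is
 * not taken because the lookup uses RW_NONE.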
*/ int zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (ENOENT); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); } /* * Update zvol ro property. */ int zvol_set_ro(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (-1); if (value) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } mutex_exit(&zv->zv_state_lock); return (0); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } /* * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed * after a system failure. * * TODO: For now we drop block cloning transations for ZVOLs as they are * unsupported, but we still need to inform BRT about that as we * claimed them during pool import. 
* This situation can occur when we try to import a pool from a ZFS * version supporting block cloning for ZVOLs into a system that * has this ZFS version, that doesn't support block cloning for ZVOLs. */ static int zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { char name[ZFS_MAX_DATASET_NAME_LEN]; zvol_state_t *zv = arg1; objset_t *os = zv->zv_objset; lr_clone_range_t *lr = arg2; blkptr_t *bp; dmu_tx_t *tx; spa_t *spa; uint_t ii; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + dmu_objset_name(os, name); cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", name); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); tx = dmu_tx_create(os); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } spa = os->os_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); return (0); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ zvol_replay_clone_range /* TX_CLONE_RANGE */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t sz = size; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); } } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. 
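 * (Both the zvol_state_list entry and the zvol_htable hash link are
 * dropped; as the ASSERT below notes, the caller must hold
 * zvol_state_lock as writer.)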
*/ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. 
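 * (zv_suspend_ref fills that role instead: zvol_remove_minors_impl() and
 * zvol_remove_minor_impl() leave a zvol alone while its suspend reference
 * count is nonzero, so the structure is not freed out from under a
 * pending zvol_resume().)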
*/ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. 
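 * (Flow sketch: hold the dsl_pool, walk the dd_clones ZAP of the
 * dataset's dsl_dir, and queue a minors_job_t for every clone found so a
 * minor node is created for it as well; failures just mean fewer minors
 * get queued, matching the best-effort note above.)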
*/ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. 
*/ void zvol_create_minors_recursive(const char *name) { list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return; /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); } void zvol_create_minor(const char *name) { /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) return; if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { (void) zvol_os_create_minor(name); } } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_free_task(void *arg) { zvol_os_free(arg); } void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. 
*/ zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) zvol_os_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_os_rename_minor(zv, name); kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). 
*/ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; strlcpy(task->name1, name1, sizeof (task->name1)); if (name2 != NULL) strlcpy(task->name2, name2, sizeof (task->name2)); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->op) { case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { zvol_set_prop_int_arg_t *zsda = arg; char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; uint64_t prop; const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); switch (zsda->zsda_prop) { case ZFS_PROP_VOLMODE: task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, prop); break; case ZFS_PROP_SNAPDEV: task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, prop); break; default: task = NULL; break; } if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. 
*/ static void zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = val; zsda.zsda_prop = prop; return (dsl_sync_task(ddname, zvol_set_common_check, zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib index 6b83b10d604d..51eff3023e73 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib @@ -1,634 +1,634 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # Copyright (c) 2012, 2019 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. 
# Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. # Copyright 2019 Richard Elling # # # Returns SCSI host number for the given disk # function get_scsi_host #disk { typeset disk=$1 ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1 } # # Cause a scan of all scsi host adapters by default # # $1 optional host number # function scan_scsi_hosts { typeset hostnum=${1} if is_linux; then if [[ -z $hostnum ]]; then for host in /sys/class/scsi_host/host*; do log_must eval "echo '- - -' > $host/scan" done else log_note "/sys/class/scsi_host/host$hostnum/scan" log_must eval \ "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan" fi fi } # # Wait for newly created block devices to have their minors created. # Additional arguments can be passed to udevadm trigger, with the expected # arguments to typically be a block device pathname. This is useful when # checking waiting on a specific device to settle rather than triggering # all devices and waiting for them all to settle. # # The udevadm settle timeout can be 120 or 180 seconds by default for # some distros. If a long delay is experienced, it could be due to some # strangeness in a malfunctioning device that isn't related to the devices # under test. To help debug this condition, a notice is given if settle takes # too long. # # Note: there is no meaningful return code if udevadm fails. Consumers # should not expect a return code (do not call as argument to log_must) # function block_device_wait { if is_linux; then udevadm trigger $* 2>/dev/null typeset start=$SECONDS udevadm settle typeset elapsed=$((SECONDS - start)) [[ $elapsed > 60 ]] && \ log_note udevadm settle time too long: $elapsed elif is_freebsd; then if [[ ${#@} -eq 0 ]]; then # Do something that has to go through the geom event # queue to complete. sysctl kern.geom.conftxt >/dev/null return fi fi # Poll for the given paths to appear, but give up eventually. typeset -i i for (( i = 0; i < 5; ++i )); do typeset missing=false typeset dev for dev in "${@}"; do if ! [[ -e $dev ]]; then missing=true break fi done if ! $missing; then break fi sleep ${#@} done } # # Check if the given device is physical device # function is_physical_device #device { typeset device=${1#$DEV_DSKDIR/} device=${device#$DEV_RDSKDIR/} if is_linux; then is_disk_device "$DEV_DSKDIR/$device" && \ [ -f /sys/module/loop/parameters/max_part ] elif is_freebsd; then is_disk_device "$DEV_DSKDIR/$device" && \ echo $device | grep -qE \ -e '^a?da[0-9]+$' \ -e '^md[0-9]+$' \ -e '^mfid[0-9]+$' \ -e '^nda[0-9]+$' \ -e '^nvd[0-9]+$' \ -e '^vtbd[0-9]+$' else echo $device | grep -qE "^c[0-F]+([td][0-F]+)+$" fi } # # Check if the given device is a real device (ie SCSI device) # function is_real_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ grep -q disk fi } # # Check if the given device is a loop device # function is_loop_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ grep -q loop fi } # # Linux: # Check if the given device is a multipath device and if there is a symbolic # link to a device mapper and to a disk # Currently no support for dm devices alone without multipath # # FreeBSD: # Check if the given device is a gmultipath device. # # Others: # No multipath detection. 
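# Usage sketch (the device name below is only an example):
#
#	if is_mpath_device mpatha; then
#		echo "mpatha is managed by device-mapper multipath"
#	fi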
# function is_mpath_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then if lsblk $DEV_MPATHDIR/$disk -o TYPE 2>/dev/null | \ grep -q mpath; then readlink $DEV_MPATHDIR/$disk > /dev/null 2>&1 else false fi elif is_freebsd; then is_disk_device $DEV_MPATHDIR/$disk else false fi } # # Check if the given path is the appropriate sort of device special node. # function is_disk_device #path { typeset path=$1 if is_freebsd; then # FreeBSD doesn't have block devices, only character devices. test -c $path else test -b $path fi } # Set the slice prefix for disk partitioning depending # on whether the device is a real, multipath, or loop device. # Currently all disks have to be of the same type, so only # checks first disk to determine slice prefix. # function set_slice_prefix { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | awk '{print $(i + 1)}')" if is_mpath_device $disk && ! echo $disk | awk 'substr($1,18,1) ~ /^[[:digit:]]+$/ {exit 1}' || is_real_device $disk; then export SLICE_PREFIX="" return 0 elif is_mpath_device $disk || is_loop_device $disk; then export SLICE_PREFIX="p" return 0 else log_fail "$disk not supported for partitioning." fi (( i = i + 1)) done fi } # # Set the directory path of the listed devices in $DISK_ARRAY_NUM # Currently all disks have to be of the same type, so only # checks first disk to determine device directory # default = /dev (linux) # real disk = /dev (linux) # multipath device = /dev/mapper (linux) # function set_device_dir { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | awk '{print $(i + 1)}')" if is_mpath_device $disk; then export DEV_DSKDIR=$DEV_MPATHDIR return 0 else export DEV_DSKDIR=$DEV_RDSKDIR return 0 fi (( i = i + 1)) done else export DEV_DSKDIR=$DEV_RDSKDIR fi } # # Get the directory path of given device # function get_device_dir #device { typeset device=$1 if ! is_freebsd && ! is_physical_device $device; then if [[ $device != "/" ]]; then device=${device%/*} fi if is_disk_device "$DEV_DSKDIR/$device"; then device="$DEV_DSKDIR" fi echo $device else echo "$DEV_DSKDIR" fi } # # Get persistent name for given disk # function get_persistent_disk_name #device { typeset device=$1 if is_linux; then if is_real_device $device; then udevadm info -q all -n $DEV_DSKDIR/$device \ - | awk '/disk\/by-id/ {print $2; exit}' | cut -d/ -f3 + | awk '/disk\/by-id/ {print $2; exit}' | cut -d/ -f3- elif is_mpath_device $device; then udevadm info -q all -n $DEV_DSKDIR/$device \ | awk '/disk\/by-id\/dm-uuid/ {print $2; exit}' \ | cut -d/ -f3 else echo $device fi else echo $device fi } # # Online or offline a disk on the system # # First checks state of disk. Test will fail if disk is not properly onlined # or offlined. Online is a full rescan of SCSI disks by echoing to every # host entry. 
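# Typical callers are remove_disk() and insert_disk() below, sketched here
# with example device and host numbers:
#
#	on_off_disk sdb offline		# simulate pulling disk sdb
#	on_off_disk sdb online 0	# rescan SCSI host 0 and restore it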
# function on_off_disk # disk state{online,offline} host { typeset disk=$1 typeset state=$2 typeset host=$3 [[ -z $disk ]] || [[ -z $state ]] && \ log_fail "Arguments invalid or missing" if is_linux; then if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)" dep="$(ls /sys/block/${dm_name}/slaves | awk '{print $1}')" while [[ -n $dep ]]; do #check if disk is online if lsscsi | grep -qF $dep; then dep_dir="/sys/block/${dm_name}" dep_dir+="/slaves/${dep}/device" ss="${dep_dir}/state" sd="${dep_dir}/delete" log_must eval "echo 'offline' > ${ss}" log_must eval "echo '1' > ${sd}" if lsscsi | grep -qF $dep; then log_fail "Offlining $disk failed" fi fi dep="$(ls /sys/block/$dm_name/slaves 2>/dev/null | awk '{print $1}')" done elif [[ $state == "offline" ]] && ( is_real_device $disk ); then #check if disk is online if lsscsi | grep -qF $disk; then dev_state="/sys/block/$disk/device/state" dev_delete="/sys/block/$disk/device/delete" log_must eval "echo 'offline' > ${dev_state}" log_must eval "echo '1' > ${dev_delete}" if lsscsi | grep -qF $disk; then log_fail "Offlining $disk failed" fi else log_note "$disk is already offline" fi elif [[ $state == "online" ]]; then #force a full rescan scan_scsi_hosts $host block_device_wait if is_mpath_device $disk; then dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)" dep="$(ls /sys/block/$dm_name/slaves | awk '{print $1}')" if lsscsi | grep -qF $dep; then log_fail "Onlining $disk failed" fi elif is_real_device $disk; then block_device_wait typeset -i retries=0 while ! lsscsi | grep -qF $disk; do if (( $retries > 2 )); then log_fail "Onlining $disk failed" break fi (( ++retries )) sleep 1 done else log_fail "$disk is not a real dev" fi else log_fail "$disk failed to $state" fi fi } # # Simulate disk removal # function remove_disk #disk { typeset disk=$1 on_off_disk $disk "offline" block_device_wait } # # Simulate disk insertion for the given SCSI host # function insert_disk #disk scsi_host { typeset disk=$1 typeset scsi_host=$2 on_off_disk $disk "online" $scsi_host block_device_wait } # # Load scsi_debug module with specified parameters # $blksz can be either one of: < 512b | 512e | 4Kn > # function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz { typeset devsize=$1 typeset hosts=$2 typeset tgts=$3 typeset luns=$4 typeset blksz=$5 [[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \ [[ -z $luns ]] || [[ -z $blksz ]] && \ log_fail "Arguments invalid or missing" case "$5" in '512b') typeset sector=512 typeset blkexp=0 ;; '512e') typeset sector=512 typeset blkexp=3 ;; '4Kn') typeset sector=4096 typeset blkexp=0 ;; *) log_fail "Unsupported blksz value: $5" ;; esac if is_linux; then modprobe -n scsi_debug || log_unsupported "Platform does not have scsi_debug module" if lsmod | grep -q scsi_debug; then log_fail "scsi_debug module already installed" else log_must modprobe scsi_debug dev_size_mb=$devsize \ add_host=$hosts num_tgts=$tgts max_luns=$luns \ sector_size=$sector physblk_exp=$blkexp block_device_wait if ! lsscsi | grep -q scsi_debug; then log_fail "scsi_debug module install failed" fi fi fi } # # Unload scsi_debug module, if needed. # function unload_scsi_debug { log_must_retry "in use" 5 modprobe -r scsi_debug } # # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). 
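# Usage sketch:
#
#	typeset sd=$(get_debug_device)	# e.g. "sdb" once lsscsi has settled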
# function get_debug_device { for i in {1..10} ; do val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3) # lsscsi can take time to settle if [ "$val" != "-" ] ; then break fi sleep 1 done echo "$val" } # # Get actual devices used by the pool (i.e. linux sdb1 not sdb). # function get_pool_devices #testpool #devdir { typeset testpool=$1 typeset devdir=$2 typeset out="" case "$UNAME" in Linux|FreeBSD) zpool status -P $testpool | awk -v d="$devdir" '$1 ~ d {sub(d "/", ""); printf("%s ", $1)}' ;; esac } # # Write to standard out giving the level, device name, offset and length # of all blocks in an input file. The offset and length are in units of # 512 byte blocks. In the case of mirrored vdevs, only the first # device is listed, as the levels, blocks and offsets will be the same # on other devices. Note that this function only works with mirrored # or non-redundant pools, not raidz. # # The output of this function can be used to introduce corruption at # varying levels of indirection. # function list_file_blocks # input_file { typeset input_file=$1 [[ -f $input_file ]] || log_fail "Couldn't find $input_file" typeset ds="$(zfs list -H -o name $input_file)" typeset pool="${ds%%/*}" typeset objnum="$(get_objnum $input_file)" # # Establish a mapping between vdev ids as shown in a DVA and the # pathnames they correspond to in ${VDEV_MAP[][]}. # # The vdev bits in a DVA refer to the top level vdev id. # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. # eval $(zdb -C $pool | awk ' BEGIN { printf "typeset -a VDEV_MAP;" } function subscript(s) { # "[#]" is more convenient than the bare "#" match(s, /\[[0-9]*\]/) return substr(s, RSTART, RLENGTH) } id && !/^ / { # left a top level vdev id = 0 } id && $1 ~ /^path:$/ { # found a vdev path; save it in the map printf "VDEV_MAP%s%s=%s;", id, child, $2 } /^ children/ { # entering a top level vdev id = subscript($0) child = "[0]" # default in case there is no nested vdev printf "typeset -a VDEV_MAP%s;", id } /^ children/ { # entering a nested vdev (e.g. child of a top level mirror) child = subscript($0) } ') # # The awk below parses the output of zdb, printing out the level # of each block along with vdev id, offset and length. The last # two are converted to decimal in the while loop. 4M is added to # the offset to compensate for the first two labels and boot # block. Lastly, the offset and length are printed in units of # 512B blocks for ease of use with dd. # typeset level vdev path offset length if awk -n '' 2>/dev/null; then # gawk needs -n to decode hex AWK='awk -n' else AWK='awk' fi sync_all_pools true zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { # top level vdev id vdev = int(dva[1]) # offset + 4M label/boot pad in 512B blocks offset = (int("0x"dva[2]) + pad) / bs # length in 512B blocks len = int("0x"dva[3]) / bs print level, vdev, offset, len ++field } } /^Indirect blocks:/ { looking = 1 } ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do echo "$level $path $offset $length" done done 2>/dev/null } function corrupt_blocks_at_level # input_file corrupt_level { typeset input_file=$1 typeset corrupt_level="L${2:-0}" typeset level path offset length [[ -f $input_file ]] || log_fail "Couldn't find $input_file" if is_freebsd; then # Temporarily allow corrupting an inuse device. 
debugflags=$(sysctl -n kern.geom.debugflags) sysctl kern.geom.debugflags=16 fi list_file_blocks $input_file | \ while read level path offset length; do if [[ $level = $corrupt_level ]]; then log_must dd if=/dev/urandom of=$path bs=512 \ count=$length seek=$offset conv=notrunc fi done if is_freebsd; then sysctl kern.geom.debugflags=$debugflags fi # This is necessary for pools made of loop devices. sync } function corrupt_label_checksum # label_number vdev_path { typeset label_size=$((256*1024)) typeset vdev_size=$(stat_size ${2}) typeset -a offsets=("$((128*1024 - 32))" \ "$(($label_size + (128*1024 - 32)))" \ "$(($vdev_size - $label_size - (128*1024 + 32)))" \ "$(($vdev_size - (128*1024 + 32)))") dd if=/dev/urandom of=${2} seek=${offsets[$1]} bs=1 count=32 \ conv=notrunc } diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index aab672a77324..24fb572f00b0 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,1149 +1,1149 @@ /* */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* 
#undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() exists and takes 4 args */ /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_put() accepts void* as arg 2 */ /* #undef HAVE_BLKDEV_PUT_HOLDER */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* blk_mode_t is defined */ /* #undef HAVE_BLK_MODE_T */ /* block multiqueue is available */ /* #undef HAVE_BLK_MQ */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* BLK_STS_RESV_CONFLICT is defined */ /* #undef HAVE_BLK_STS_RESV_CONFLICT */ /* Define if release() in block_device_operations takes 1 arg */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. 
*/ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_splice_read exists */ /* #undef HAVE_COPY_SPLICE_READ */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* dentry aliases are in d_u member */ /* #undef HAVE_DENTRY_D_U_ALIASES */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_check_media_change() exists */ /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* FALLOC_FL_ZERO_RANGE is defined */ /* #undef HAVE_FALLOC_FL_ZERO_RANGE */ /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* filemap_range_has_page() is available */ /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* fops->fadvise() exists */ /* #undef HAVE_FILE_FADVISE */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ /* fsync_bdev() is declared in include/blkdev.h */ /* #undef HAVE_FSYNC_BDEV */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ /* generic_fillattr requires struct mnt_idmap* and u32 request_mask */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global 
*/ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ /* inode_get_ctime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_CTIME */ /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ /* inode_set_ctime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_CTIME_TO_TS */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. 
*/ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->setattr() exists */ /* #undef HAVE_IOPS_SETATTR */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* iter_iov() is available */ /* #undef HAVE_ITER_IOV */ /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* linux/blk-cgroup.h exists */ /* 
#undef HAVE_LINUX_BLK_CGROUP_HEADER */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* register_sysctl_table exists */ /* #undef HAVE_REGISTER_SYSCTL_TABLE */ /* iops->rename2() exists */ /* #undef HAVE_RENAME2 */ /* struct inode_operations_wrapper takes .rename2() */ /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. 
*/ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* cs->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* sync_blockdev() is declared in include/blkdev.h */ /* #undef HAVE_SYNC_BLOCKDEV */ /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. 
*/ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ /* user_namespace->ns.inum exists */ /* #undef HAVE_USER_NS_COMMON_INUM */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* fops->clone_file_range() is available */ /* #undef HAVE_VFS_CLONE_FILE_RANGE */ /* fops->copy_file_range() is available */ /* #undef HAVE_VFS_COPY_FILE_RANGE */ /* fops->dedupe_file_range() is available */ /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* file_operations_extend takes .copy_file_range() and .clone_file_range() */ /* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */ /* generic_copy_file_range() is available */ /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->remap_file_range() is available */ /* #undef HAVE_VFS_REMAP_FILE_RANGE */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES 
*/ #define HAVE_XSAVES 1 /* ZERO_PAGE() is GPL-only */ /* #undef HAVE_ZERO_PAGE_GPL_ONLY */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* using complete_and_exit() instead */ /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* pde_data() is PDE_DATA() */ /* #undef SPL_PDE_DATA */ /* Define to 1 if all of the C90 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* Version number of package */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* GENHD_FL_EXT_DEVT flag is not available */ /* #undef ZFS_GENHD_FL_EXT_DEVT */ /* GENHD_FL_NO_PART_SCAN flag is available */ /* #undef ZFS_GENHD_FL_NO_PART */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.99-231-FreeBSD_g688514e47" +#define ZFS_META_ALIAS "zfs-2.2.99-239-FreeBSD_ga03ebd9be" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ #define ZFS_META_KVER_MAX "6.6" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. 
*/
#define ZFS_META_NAME "zfs"

/* Define the project release. */
-#define ZFS_META_RELEASE "231-FreeBSD_g688514e47"
+#define ZFS_META_RELEASE "239-FreeBSD_ga03ebd9be"

/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"

/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index 3dc0d7e7eaa5..12c5321568cd 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-231-g688514e47"
+#define ZFS_META_GITREV "zfs-2.2.99-239-ga03ebd9be"
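A minimal usage sketch for the test-suite helpers imported above, assuming the standard ZFS test-suite environment (libtest.shlib sourced, $TESTPOOL/$TESTFS created by the usual setup, and a non-redundant or mirrored pool as list_file_blocks requires); the file name, sizes, and the final status check are illustrative assumptions, not part of the imported code:

	typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
	log_must dd if=/dev/urandom of=$mntpnt/victim bs=128k count=4
	sync_all_pools

	# Overwrite every L0 (data) block of the file directly on the
	# vdevs, then scrub so the damage is detected and reported.
	corrupt_blocks_at_level $mntpnt/victim 0
	log_must zpool scrub -w $TESTPOOL
	log_must eval "zpool status -v $TESTPOOL | grep -q 'Permanent errors'"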