diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index 836ed679dbac..9d67dab06ca3 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -1,350 +1,349 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_ARC_H #define _SYS_ARC_H #include #ifdef __cplusplus extern "C" { #endif #include #include #include #include /* * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. */ #define ARC_EVICT_ALL UINT64_MAX /* * ZFS gets very unhappy when the maximum ARC size is smaller than the maximum * block size and a larger block is written. To leave some safety margin, we * limit the minimum for zfs_arc_max to the maximium transaction size. */ #define MIN_ARC_MAX DMU_MAX_ACCESS #define HDR_SET_LSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) #define HDR_SET_PSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) #define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) #define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; /* * Because the ARC can store encrypted data, errors (not due to bugs) may arise * while transforming data into its desired format - specifically, when * decrypting, the key may not be present, or the HMAC may not be correct * which signifies deliberate tampering with the on-disk state * (assuming that the checksum was correct). If any error occurs, the "buf" * parameter will be NULL. */ typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern uint_t zfs_arc_average_blocksize; extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; arc_read_done_func_t arc_getbuf_func; /* generic arc_prune_func_t wrapper for callbacks */ struct arc_prune { arc_prune_func_t *p_pfunc; void *p_private; uint64_t p_adjust; list_node_t p_node; zfs_refcount_t p_refcnt; }; typedef enum arc_strategy { ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ } arc_strategy_t; typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ /* * Encrypted or authenticated on disk (may be plaintext in memory). * This header has b_crypt_hdr allocated. Does not include indirect * blocks with checksums of MACs which will also have their X * (encrypted) bit set in the bp. */ ARC_FLAG_PROTECTED = 1 << 15, /* data has not been authenticated yet */ ARC_FLAG_NOAUTH = 1 << 16, /* indicates that the buffer contains metadata (otherwise, data) */ ARC_FLAG_BUFC_METADATA = 1 << 17, /* Flags specifying whether optional hdr struct fields are defined */ ARC_FLAG_HAS_L1HDR = 1 << 18, ARC_FLAG_HAS_L2HDR = 1 << 19, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. */ ARC_FLAG_COMPRESSED_ARC = 1 << 20, ARC_FLAG_SHARED_DATA = 1 << 21, /* * Fail this arc_read() (with ENOENT) if the data is not already present * in cache. */ ARC_FLAG_CACHED_ONLY = 1 << 22, /* * Don't instantiate an arc_buf_t for arc_read_done. */ ARC_FLAG_NO_BUF = 1 << 23, /* * The arc buffer's compression mode is stored in the top 7 bits of the * flags field, so these dummy flags are included so that MDB can * interpret the enum properly. */ ARC_FLAG_COMPRESS_0 = 1 << 24, ARC_FLAG_COMPRESS_1 = 1 << 25, ARC_FLAG_COMPRESS_2 = 1 << 26, ARC_FLAG_COMPRESS_3 = 1 << 27, ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 } arc_flags_t; typedef enum arc_buf_flags { ARC_BUF_FLAG_SHARED = 1 << 0, ARC_BUF_FLAG_COMPRESSED = 1 << 1, /* * indicates whether this arc_buf_t is encrypted, regardless of * state on-disk */ ARC_BUF_FLAG_ENCRYPTED = 1 << 2 } arc_buf_flags_t; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; void *b_data; arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * The following breakdowns of arc_size exist for kstat only. */ typedef enum arc_space_type { ARC_SPACE_DATA, ARC_SPACE_META, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_DBUF, ARC_SPACE_DNODE, ARC_SPACE_BONUS, ARC_SPACE_ABD_CHUNK_WASTE, ARC_SPACE_NUMTYPES } arc_space_type_t; typedef enum arc_state_type { ARC_STATE_ANON, ARC_STATE_MRU, ARC_STATE_MRU_GHOST, ARC_STATE_MFU, ARC_STATE_MFU_GHOST, ARC_STATE_L2C_ONLY, ARC_STATE_UNCACHED, ARC_STATE_NUMTYPES } arc_state_type_t; typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; uint32_t abi_flags; uint32_t abi_bufcnt; uint64_t abi_size; uint64_t abi_spa; uint64_t abi_access; uint32_t abi_mru_hits; uint32_t abi_mru_ghost_hits; uint32_t abi_mfu_hits; uint32_t abi_mfu_ghost_hits; uint32_t abi_l2arc_hits; uint32_t abi_holds; uint64_t abi_l2arc_dattr; uint64_t abi_l2arc_asize; enum zio_compress abi_l2arc_compress; } arc_buf_info_t; void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); boolean_t arc_is_encrypted(arc_buf_t *buf); boolean_t arc_is_unauthenticated(arc_buf_t *buf); enum zio_compress arc_get_compression(arc_buf_t *buf); void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac); int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place); void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac); arc_buf_t *arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size); arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); uint8_t arc_get_complevel(arc_buf_t *buf); arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); void arc_return_buf(arc_buf_t *buf, const void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag); void arc_buf_destroy(arc_buf_t *buf, const void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); uint64_t arc_buf_lsize(arc_buf_t *buf); void arc_buf_access(arc_buf_t *buf); void arc_release(arc_buf_t *buf, const void *tag); int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #else #define arc_referenced(buf) ((void) sizeof (buf), 0) #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *priv, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb); + arc_write_done_func_t *done, void *priv, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); /* * Level 2 ARC */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; #endif #ifdef __cplusplus } #endif #endif /* _SYS_ARC_H */ diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index fd24d2f3c8bd..78774792f367 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -1,1098 +1,1097 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Delphix. All rights reserved. * Copyright (c) 2013, Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. */ #ifndef _SYS_ARC_IMPL_H #define _SYS_ARC_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_uncached - uncacheable prefetch, to be evicted * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * supports the "dbufs" kstat */ arc_state_type_t arcs_state; /* * total amount of data in this state. */ zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * amount of hit bytes for this state (counted only for ghost states) */ wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; boolean_t acb_wait; int acb_wait_error; kmutex_t acb_wait_lock; kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_prev; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { /* for waiting on reads to complete */ kcondvar_t b_cv; uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* protected by hash lock */ clock_t b_arc_access; uint32_t b_mru_hits; uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; uint32_t b_bufcnt; arc_buf_t *b_buf; /* self protecting */ zfs_refcount_t b_refcnt; arc_callback_t *b_acb; abd_t *b_pabd; #ifdef ZFS_DEBUG zio_cksum_t *b_freeze_cksum; kmutex_t b_freeze_lock; #endif } l1arc_buf_hdr_t; typedef enum l2arc_dev_hdr_flags_t { L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ } l2arc_dev_hdr_flags_t; /* * Pointer used in persistent L2ARC (for pointing to log blocks). */ typedef struct l2arc_log_blkptr { /* * Offset of log block within the device, in bytes */ uint64_t lbp_daddr; /* * Aligned payload size (in bytes) of the log block */ uint64_t lbp_payload_asize; /* * Offset in bytes of the first buffer in the payload */ uint64_t lbp_payload_start; /* * lbp_prop has the following format: * * logical size (in bytes) * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ uint64_t lbp_prop; zio_cksum_t lbp_cksum; /* checksum of log */ } l2arc_log_blkptr_t; /* * The persistent L2ARC device header. * Byte order of magic determines whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_dev_hdr_phys { uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ uint64_t dh_version; /* Persistent L2ARC version */ /* * Global L2ARC device state and metadata. */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ uint64_t dh_start; /* mirror of l2ad_start */ uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ /* * Mirrors of vdev_trim_action_time and vdev_trim_state, used to * display when the cache device was fully trimmed for the last * time. */ uint64_t dh_trim_action_time; uint64_t dh_trim_state; const uint64_t dh_pad[30]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; _Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE, "l2arc_dev_hdr_phys_t wrong size"); /* * A single ARC buffer header entry in a l2arc_log_blk_phys_t. */ typedef struct l2arc_log_ent_phys { dva_t le_dva; /* dva of buffer */ uint64_t le_birth; /* birth txg of buffer */ /* * le_prop has the following format: * * logical size (in bytes) * * physical (compressed) size (in bytes) * * compression algorithm * * object type (used to restore arc_buf_contents_t) * * protected status (used for encryption) * * prefetch status (used in l2arc_read_done()) */ uint64_t le_prop; uint64_t le_daddr; /* buf location on l2dev */ uint64_t le_complevel; /* * We pad the size of each entry to a power of 2 so that the size of * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, * because of the L2ARC_SET_*SIZE macros. */ const uint64_t le_pad[2]; /* pad to 64 bytes */ } l2arc_log_ent_phys_t; #define L2ARC_LOG_BLK_MAX_ENTRIES (1022) /* * A log block of up to 1022 ARC buffer log entries, chained into the * persistent L2ARC metadata linked list. Byte order of magic determines * whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_log_blk_phys { uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ /* * There are 2 chains (headed by dh_start_lbps[2]), and this field * points back to the previous block in this chain. We alternate * which chain we append to, so they are time-wise and offset-wise * interleaved, but that is an optimization rather than for * correctness. */ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ /* * Pad header section to 128 bytes */ uint64_t lb_pad[7]; /* Payload */ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; } l2arc_log_blk_phys_t; /* 64K total */ /* * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. */ _Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned"); _Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE, "l2arc_log_blk_phys_t too small"); _Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE, "l2arc_log_blk_phys_t too big"); /* * These structures hold in-flight abd buffers for log blocks as they're being * written to the L2ARC device. */ typedef struct l2arc_lb_abd_buf { abd_t *abd; list_node_t node; } l2arc_lb_abd_buf_t; /* * These structures hold pointers to log blocks present on the L2ARC device. */ typedef struct l2arc_lb_ptr_buf { l2arc_log_blkptr_t *lb_ptr; list_node_t node; } l2arc_lb_ptr_buf_t; /* Macros for setting fields in le_prop and lbp_prop */ #define L2BLK_GET_LSIZE(field) \ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_LSIZE(field, x) \ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_PSIZE(field) \ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_PSIZE(field, x) \ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_COMPRESS(field) \ BF64_GET((field), 32, SPA_COMPRESSBITS) #define L2BLK_SET_COMPRESS(field, x) \ BF64_SET((field), 32, SPA_COMPRESSBITS, x) #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) /* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ #define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) #define PTR_SWAP(x, y) \ do { \ void *tmp = (x);\ x = y; \ y = tmp; \ } while (0) #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ zfs_refcount_t l2ad_alloc; /* allocated bytes */ /* * Persistence-related stuff */ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ int l2ad_log_ent_idx; /* index into cur log blk */ /* Number of bytes in current log block's payload */ uint64_t l2ad_log_blk_payload_asize; /* * Offset (in bytes) of the first buffer in current log block's * payload. */ uint64_t l2ad_log_blk_payload_start; /* Flag indicating whether a rebuild is scheduled or is going on */ boolean_t l2ad_rebuild; boolean_t l2ad_rebuild_cancel; boolean_t l2ad_rebuild_began; uint64_t l2ad_log_entries; /* entries per log blk */ uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ zfs_refcount_t l2ad_lb_asize; /* * Number of log blocks present on the device. */ zfs_refcount_t l2ad_lb_count; boolean_t l2ad_trim_all; /* TRIM whole device */ } l2arc_dev_t; /* * Encrypted blocks will need to be stored encrypted on the L2ARC * disk as they appear in the main pool. In order for this to work we * need to pass around the encryption parameters so they can be used * to write data to the L2ARC. This struct is only defined in the * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED * flag set. */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ dmu_object_type_t b_ot; /* object type */ uint32_t b_ebufcnt; /* count of encrypted buffers */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; /* * Technically this could be removed since we will always be able to * get the mac from the bp when we need it. However, it is inconvenient * for callers of arc code to have to pass a bp in all the time. This * also allows us to assert that L2ARC data is properly encrypted to * match the data in the main storage pool. */ uint8_t b_mac[ZIO_DATA_MAC_LEN]; } arc_buf_hdr_crypt_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ /* in-flight list of log blocks */ list_t l2wcb_abd_list; } l2arc_write_callback_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; uint8_t b_complevel; uint8_t b_reserved1; /* used for 4 byte alignment */ uint16_t b_reserved2; /* used for 4 byte alignment */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; /* * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED * is set and the L1 header exists. */ arc_buf_hdr_crypt_t b_crypt_hdr; }; typedef struct arc_stats { /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; /* Number of requests for which I/O was already running. */ kstat_named_t arcstat_iohits; /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_uncached_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped when updating the access state due to the * header having already been released after acquiring the hash lock. */ kstat_named_t arcstat_access_skip; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_eligible_mfu; kstat_named_t arcstat_evict_l2_eligible_mru; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_meta; kstat_named_t arcstat_pd; kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by dmu_buf_impl_t objects. */ kstat_named_t arcstat_dbuf_size; /* * Number of bytes consumed by dnode_t objects. */ kstat_named_t arcstat_dnode_size; /* * Number of bytes consumed by bonus buffers. */ kstat_named_t arcstat_bonus_size; #if defined(COMPAT_FREEBSD11) /* * Sum of the previous three counters, provided for compatibility. */ kstat_named_t arcstat_other_size; #endif /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; kstat_named_t arcstat_anon_data; kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; kstat_named_t arcstat_mru_data; kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; kstat_named_t arcstat_mru_ghost_data; kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; kstat_named_t arcstat_mfu_data; kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; kstat_named_t arcstat_mfu_ghost_data; kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; /* * Total number of bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; kstat_named_t arcstat_uncached_data; kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_data; /* * Number of metadata bytes that that are going to be evicted from ARC * due to ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* * Allocated size (in bytes) of L2ARC cached buffers by ARC state. */ kstat_named_t arcstat_l2_prefetch_asize; kstat_named_t arcstat_l2_mru_asize; kstat_named_t arcstat_l2_mfu_asize; /* * Allocated size (in bytes) of L2ARC cached buffers by buffer content * type. */ kstat_named_t arcstat_l2_bufc_data_asize; kstat_named_t arcstat_l2_bufc_metadata_asize; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_lsize; kstat_named_t arcstat_l2_psize; kstat_named_t arcstat_l2_hdr_size; /* * Number of L2ARC log blocks written. These are used for restoring the * L2ARC. Updated during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_log_blk_writes; /* * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ kstat_named_t arcstat_l2_log_blk_avg_asize; /* Aligned size of L2ARC log blocks on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_asize; /* Number of L2ARC log blocks present on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_count; /* * Moving average of the aligned size of L2ARC restored data, in bytes, * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; /* * Number of times the L2ARC rebuild was successful for an L2ARC device. */ kstat_named_t arcstat_l2_rebuild_success; /* * Number of times the L2ARC rebuild failed because the device header * was in an unsupported format or corrupted. */ kstat_named_t arcstat_l2_rebuild_abort_unsupported; /* * Number of times the L2ARC rebuild failed because of IO errors * while reading a log block. */ kstat_named_t arcstat_l2_rebuild_abort_io_errors; /* * Number of times the L2ARC rebuild failed because of IO errors when * reading the device header. */ kstat_named_t arcstat_l2_rebuild_abort_dh_errors; /* * Number of L2ARC log blocks which failed to be restored due to * checksum errors. */ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; /* * Number of times the L2ARC rebuild was aborted due to low system * memory. */ kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; /* Aligned size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. */ kstat_named_t arcstat_l2_rebuild_bufs; /* * Number of L2ARC log entries (buffers) already cached in ARC. These * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. */ kstat_named_t arcstat_l2_rebuild_log_blks; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_memory_all_bytes; kstat_named_t arcstat_memory_free_bytes; kstat_named_t arcstat_memory_available_bytes; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; /* Number of requests for which predictive prefetch was running. */ kstat_named_t arcstat_demand_iohit_predictive_prefetch; /* Number of prescient prefetch requests. */ kstat_named_t arcstat_prescient_prefetch; /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; /* Number of requests for which prescient prefetch was running. */ kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; kstat_named_t arcstat_cached_only_in_progress; kstat_named_t arcstat_abd_chunk_waste_size; } arc_stats_t; typedef struct arc_sums { wmsum_t arcstat_hits; wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; wmsum_t arcstat_mfu_hits; wmsum_t arcstat_mfu_ghost_hits; wmsum_t arcstat_uncached_hits; wmsum_t arcstat_deleted; wmsum_t arcstat_mutex_miss; wmsum_t arcstat_access_skip; wmsum_t arcstat_evict_skip; wmsum_t arcstat_evict_not_enough; wmsum_t arcstat_evict_l2_cached; wmsum_t arcstat_evict_l2_eligible; wmsum_t arcstat_evict_l2_eligible_mfu; wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; wmsum_t arcstat_compressed_size; wmsum_t arcstat_uncompressed_size; wmsum_t arcstat_overhead_size; wmsum_t arcstat_hdr_size; wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; wmsum_t arcstat_l2_prefetch_asize; wmsum_t arcstat_l2_mru_asize; wmsum_t arcstat_l2_mfu_asize; wmsum_t arcstat_l2_bufc_data_asize; wmsum_t arcstat_l2_bufc_metadata_asize; wmsum_t arcstat_l2_feeds; wmsum_t arcstat_l2_rw_clash; wmsum_t arcstat_l2_read_bytes; wmsum_t arcstat_l2_write_bytes; wmsum_t arcstat_l2_writes_sent; wmsum_t arcstat_l2_writes_done; wmsum_t arcstat_l2_writes_error; wmsum_t arcstat_l2_writes_lock_retry; wmsum_t arcstat_l2_evict_lock_retry; wmsum_t arcstat_l2_evict_reading; wmsum_t arcstat_l2_evict_l1cached; wmsum_t arcstat_l2_free_on_write; wmsum_t arcstat_l2_abort_lowmem; wmsum_t arcstat_l2_cksum_bad; wmsum_t arcstat_l2_io_error; wmsum_t arcstat_l2_lsize; wmsum_t arcstat_l2_psize; aggsum_t arcstat_l2_hdr_size; wmsum_t arcstat_l2_log_blk_writes; wmsum_t arcstat_l2_log_blk_asize; wmsum_t arcstat_l2_log_blk_count; wmsum_t arcstat_l2_rebuild_success; wmsum_t arcstat_l2_rebuild_abort_unsupported; wmsum_t arcstat_l2_rebuild_abort_io_errors; wmsum_t arcstat_l2_rebuild_abort_dh_errors; wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; wmsum_t arcstat_l2_rebuild_abort_lowmem; wmsum_t arcstat_l2_rebuild_size; wmsum_t arcstat_l2_rebuild_asize; wmsum_t arcstat_l2_rebuild_bufs; wmsum_t arcstat_l2_rebuild_bufs_precached; wmsum_t arcstat_l2_rebuild_log_blks; wmsum_t arcstat_memory_throttle_count; wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; wmsum_t arcstat_demand_iohit_predictive_prefetch; wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; } arc_sums_t; typedef struct arc_evict_waiter { list_node_t aew_node; kcondvar_t aew_cv; uint64_t aew_count; } arc_evict_waiter_t; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ wmsum_add(&arc_sums.stat, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ #define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ #define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ #define arc_anon (&ARC_anon) #define arc_mru (&ARC_mru) #define arc_mru_ghost (&ARC_mru_ghost) #define arc_mfu (&ARC_mfu) #define arc_mfu_ghost (&ARC_mfu_ghost) #define arc_l2c_only (&ARC_l2c_only) #define arc_uncached (&ARC_uncached) extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; extern uint_t arc_grow_retry; extern uint_t arc_no_grow_shift; extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; extern uint_t arc_lotsfree_percent; extern uint64_t zfs_arc_min; extern uint64_t zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); extern void arc_prune_async(uint64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); extern void arc_register_hotplug(void); extern void arc_unregister_hotplug(void); extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); /* used in zdb.c */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); /* used in vdev_trim.c */ void l2arc_dev_hdr_update(l2arc_dev_t *dev); l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); #ifdef __cplusplus } #endif #endif /* _SYS_ARC_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/zfs_refcount.h b/sys/contrib/openzfs/include/sys/zfs_refcount.h index 4efa266a53c5..77965a0aa580 100644 --- a/sys/contrib/openzfs/include/sys/zfs_refcount.h +++ b/sys/contrib/openzfs/include/sys/zfs_refcount.h @@ -1,140 +1,144 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_REFCOUNT_H #define _SYS_ZFS_REFCOUNT_H #include +#include #include #include #ifdef __cplusplus extern "C" { #endif /* * If the reference is held only by the calling function and not any * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. */ #define FTAG ((char *)(uintptr_t)__func__) #ifdef ZFS_DEBUG typedef struct reference { - list_node_t ref_link; + union { + avl_node_t a; + list_node_t l; + } ref_link; const void *ref_holder; uint64_t ref_number; - uint8_t *ref_removed; + boolean_t ref_search; } reference_t; typedef struct refcount { + uint64_t rc_count; kmutex_t rc_mtx; - boolean_t rc_tracked; - list_t rc_list; + avl_tree_t rc_tree; list_t rc_removed; - uint64_t rc_count; - uint64_t rc_removed_count; + uint_t rc_removed_count; + boolean_t rc_tracked; } zfs_refcount_t; /* * Note: zfs_refcount_t must be initialized with * refcount_create[_untracked]() */ void zfs_refcount_create(zfs_refcount_t *); void zfs_refcount_create_untracked(zfs_refcount_t *); void zfs_refcount_create_tracked(zfs_refcount_t *); void zfs_refcount_destroy(zfs_refcount_t *); void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); int zfs_refcount_is_zero(zfs_refcount_t *); int64_t zfs_refcount_count(zfs_refcount_t *); int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); /* * Note that (add|remove)_many adds/removes one reference with "number" N, * _not_ N references with "number" 1, which is what (add|remove)_few does, * or what vanilla zfs_refcount_(add|remove) called N times would do. * * Attempting to remove a reference with number N when none exists is a * panic on debug kernels with reference_tracking enabled. */ void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); void zfs_refcount_transfer_ownership(zfs_refcount_t *, const void *, const void *); void zfs_refcount_transfer_ownership_many(zfs_refcount_t *, uint64_t, const void *, const void *); boolean_t zfs_refcount_held(zfs_refcount_t *, const void *); boolean_t zfs_refcount_not_held(zfs_refcount_t *, const void *); void zfs_refcount_init(void); void zfs_refcount_fini(void); #else /* ZFS_DEBUG */ typedef struct refcount { uint64_t rc_count; } zfs_refcount_t; #define zfs_refcount_create(rc) ((rc)->rc_count = 0) #define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0) #define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) #define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) #define zfs_refcount_is_zero(rc) (zfs_refcount_count(rc) == 0) #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) #define zfs_refcount_add_few(rc, number, holder) \ atomic_add_64(&(rc)->rc_count, number) #define zfs_refcount_remove_few(rc, number, holder) \ atomic_add_64(&(rc)->rc_count, -number) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, -number) #define zfs_refcount_transfer(dst, src) { \ uint64_t __tmp = zfs_refcount_count(src); \ atomic_add_64(&(src)->rc_count, -__tmp); \ atomic_add_64(&(dst)->rc_count, __tmp); \ } #define zfs_refcount_transfer_ownership(rc, ch, nh) ((void)0) #define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0) #define zfs_refcount_held(rc, holder) (zfs_refcount_count(rc) > 0) #define zfs_refcount_not_held(rc, holder) (B_TRUE) #define zfs_refcount_init() #define zfs_refcount_fini() #endif /* ZFS_DEBUG */ #ifdef __cplusplus } #endif #endif /* _SYS_REFCOUNT_H */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 6b1352a72b9a..ec32211f6906 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -1,720 +1,715 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_BLAKE3, ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1U << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 #define ZIO_DATA_SALT_LEN 8 #define ZIO_DATA_MAC_LEN 16 /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_ON #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_GZIP_1 || \ (compress) == ZIO_COMPRESS_GZIP_2 || \ (compress) == ZIO_COMPRESS_GZIP_3 || \ (compress) == ZIO_COMPRESS_GZIP_4 || \ (compress) == ZIO_COMPRESS_GZIP_5 || \ (compress) == ZIO_COMPRESS_GZIP_6 || \ (compress) == ZIO_COMPRESS_GZIP_7 || \ (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ (compress) == ZIO_COMPRESS_ZSTD || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) #define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) #define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) #define ZIO_COMPLEVEL_ZSTD(level) \ ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 typedef enum zio_suspend_reason { ZIO_SUSPEND_NONE = 0, ZIO_SUSPEND_IOERR, ZIO_SUSPEND_MMP, } zio_suspend_reason_t; /* * This was originally an enum type. However, those are 32-bit and there is no * way to make a 64-bit enum type. Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. */ typedef uint64_t zio_flag_t; /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ #define ZIO_FLAG_DONT_AGGREGATE (1ULL << 0) #define ZIO_FLAG_IO_REPAIR (1ULL << 1) #define ZIO_FLAG_SELF_HEAL (1ULL << 2) #define ZIO_FLAG_RESILVER (1ULL << 3) #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ #define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ #define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ #define ZIO_FLAG_PROBE (1ULL << 16) #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ #define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */ #define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20) #define ZIO_FLAG_IO_BYPASS (1ULL << 21) #define ZIO_FLAG_IO_REWRITE (1ULL << 22) #define ZIO_FLAG_RAW_COMPRESS (1ULL << 23) #define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24) #define ZIO_FLAG_GANG_CHILD (1ULL << 25) #define ZIO_FLAG_DDT_CHILD (1ULL << 26) #define ZIO_FLAG_GODFATHER (1ULL << 27) #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) #define ZIO_FLAG_FASTWRITE (1ULL << 31) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1U << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; #define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) #define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; typedef void zio_done_func_t(zio_t *zio); extern int zio_exclude_metadata; extern int zio_dva_throttle_enabled; extern const char *const zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). * * If the head_errlog feature is enabled a different on-disk format for error * logs is used. This introduces the use of an error bookmark, a four-tuple * that uniquely identifies any error block * in the pool. The birth transaction group is used to track whether the block * has been overwritten by newer data or added to a snapshot since its marking * as an error. */ struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; }; struct zbookmark_err_phys { uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; uint64_t zb_birth; }; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, struct abd *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 /* * The io_trim flags are used to specify the type of TRIM to perform. They * only apply to ZIO_TYPE_TRIM zios are distinct from io_flags. */ enum trim_flag { ZIO_TRIM_SECURE = 1U << 0, }; typedef struct zio_alloc_list { list_t zal_list; uint64_t zal_size; } zio_alloc_list_t; typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; - zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* io_lsize != io_orig_size iff this is a raw write */ uint64_t io_lsize; /* Data represented by this I/O */ struct abd *io_abd; struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ avl_node_t io_queue_node; avl_node_t io_offset_node; avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ zio_flag_t io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; zio_flag_t io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; - uint64_t io_child_count; - uint64_t io_phys_children; - uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; void *io_bio; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; }; enum blk_verify_flag { BLK_VERIFY_ONLY, BLK_VERIFY_LOG, BLK_VERIFY_HALT }; enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, zio_flag_t flags); extern void zio_destroy(zio_t *zio); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *priv, zio_priority_t priority, zio_flag_t flags, - const zbookmark_phys_t *zb); + zio_done_func_t *done, void *priv, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(void *zio); extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, const char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. */ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type); extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr); extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index a23715309f2b..7023f448182a 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1,10927 +1,10913 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed. For example, when using the ZPL each dentry * holds a references on a znode. These dentries must be pruned before * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * Balance between metadata and data on ghost hits. Values above 100 * increase metadata caching by proportionally reducing effect of ghost * data hits on target data/metadata rate. */ static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; /* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "meta", KSTAT_DATA_UINT64 }, { "pd", KSTAT_DATA_UINT64 }, { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_data", KSTAT_DATA_UINT64 }, { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_data", KSTAT_DATA_UINT64 }, { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_data", KSTAT_DATA_UINT64 }, { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_data", KSTAT_DATA_UINT64 }, { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, { "uncached_data", KSTAT_DATA_UINT64 }, { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_hdr_destroy(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } uint64_t he = atomic_inc_64_nv( &arc_stats.arcstat_hash_elements.value.ui64); ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); #endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } static int hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) { (void) unused; arc_buf_hdr_t *hdr = vbuf; hdr_full_cons(vbuf, unused, kmflag); memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr)); arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); return (0); } static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); #endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } static void hdr_full_crypt_dest(void *vbuf, void *unused) { (void) vbuf, (void) unused; hdr_full_dest(vbuf, unused); arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr), ARC_SPACE_HDRS); } static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; (void) vbuf; arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { #ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } #ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); #endif return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); ASSERT3P(tmpbuf, !=, NULL); ASSERT3U(csize, <=, psize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb, &buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) return (cnt); if (state == arc_anon) { arc_hdr_destroy(hdr); return (0); } if (state == arc_uncached && !HDR_PREFETCH(hdr)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); return (0); } multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_increment(hdr, state); return (0); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(GHOST_STATE(old_state), bufcnt == 0); IMPLY(GHOST_STATE(new_state), bufcnt == 0); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(old_state == arc_anon, bufcnt <= 1); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; if (GHOST_STATE(old_state)) update_old = B_TRUE; if (GHOST_STATE(new_state)) update_new = B_TRUE; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; if (encrypted) hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) { hdr->b_crypt_hdr.b_ebufcnt -= 1; /* * If we have no more encrypted buffers and we've * already gotten a copy of the decrypted data we can * free b_rabd to save some space. */ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_hdr_free_abd(hdr, B_TRUE); } } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); /* * if the caller wanted a new full header and the header is to be * encrypted we will actually allocate the header from the full crypt * cache instead. The same applies to freeing from the old cache. */ if (HDR_PROTECTED(hdr) && new == hdr_full_cache) new = hdr_full_crypt_cache; if (HDR_PROTECTED(hdr) && old == hdr_full_cache) old = hdr_full_crypt_cache; nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function allows an L1 header to be reallocated as a crypt * header and vice versa. If we are going to a crypt header, the * new fields will be zeroed out. */ static arc_buf_hdr_t * arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) { arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; /* * This function requires that hdr is in the arc_anon state. * Therefore it won't have any L2ARC data for us to worry * about copying. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); ASSERT3P(hdr->b_hash_next, ==, NULL); if (need_crypt) { ncache = hdr_full_crypt_cache; ocache = hdr_full_cache; } else { ncache = hdr_full_cache; ocache = hdr_full_crypt_cache; } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); /* * Copy all members that aren't locks or condvars to the new header. * No lists are pointing to us (as we asserted above), so we don't * need to worry about the list nodes. */ nhdr->b_dva = hdr->b_dva; nhdr->b_birth = hdr->b_birth; nhdr->b_type = hdr->b_type; nhdr->b_flags = hdr->b_flags; nhdr->b_psize = hdr->b_psize; nhdr->b_lsize = hdr->b_lsize; nhdr->b_spa = hdr->b_spa; #ifdef ZFS_DEBUG nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; #endif nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* * This zfs_refcount_add() exists only to ensure that the individual * arc buffers always point to a header that is referenced, avoiding * a small race condition that could trigger ASSERTs. */ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) buf->b_hdr = nhdr; zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); if (need_crypt) { arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); } else { arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); } /* unset all members of the original hdr */ memset(&hdr->b_dva, 0, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_type = 0; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; hdr->b_spa = 0; #ifdef ZFS_DEBUG hdr->b_l1hdr.b_freeze_cksum = NULL; #endif hdr->b_l1hdr.b_buf = NULL; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN); } buf_discard_identity(hdr); kmem_cache_free(ocache, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); if (!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif if (!HDR_PROTECTED(hdr)) { kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_full_crypt_cache, hdr); } } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); arc_buf_destroy_impl(buf); (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); evicted_state = (state == arc_uncached) ? arc_anon : ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed buffer then we * discard it here before we change states. This ensures that the * accounting is updated correctly in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); if (evicted_state == arc_anon) { arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. */ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_state_impl(). */ markers[i]->b_spa = 0; } return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) kmem_cache_free(hdr_full_cache, markers[i]); kmem_free(markers, sizeof (*markers) * count); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified. This * function prevents us from trying to evict more from a state's list * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, type, 0, delta)); } return (0); } /* * Adjust specified fraction, taking into account initial ghost state(s) size, * ghost hit bytes towards increasing the fraction, ghost hit bytes towards * decreasing it, plus a balance factor, controlling the decrease rate, used * to balance metadata vs data. */ static uint64_t arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, uint_t balance) { if (total < 8 || up + down == 0) return (frac); /* * We should not have more ghost hits than ghost size, but they * may get close. Restrict maximum adjustment in that case. */ if (up + down >= total / 4) { uint64_t scale = (up + down) / (total / 8); up /= scale; down /= scale; } /* Get maximal dynamic range by choosing optimal shifts. */ int s = highbit64(total); s = MIN(64 - s, 32); uint64_t ofrac = (1ULL << 32) - frac; if (frac >= 4 * ofrac) up /= frac / (2 * ofrac + 1); up = (up << s) / (total >> (32 - s)); if (ofrac >= 4 * frac) down /= ofrac / (2 * frac + 1); down = (down << s) / (total >> (32 - s)); down = down * 100 / balance; return (frac + up - down); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t asize, bytes, total_evicted = 0; int64_t e, mrud, mrum, mfud, mfum, w; static uint64_t ogrd, ogrm, ogfd, ogfm; static uint64_t gsrd, gsrm, gsfd, gsfm; uint64_t ngrd, ngrm, ngfd, ngfm; /* Get current size of ARC states we can evict from. */ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); uint64_t d = mrud + mfud; uint64_t m = mrum + mfum; uint64_t t = d + m; /* Get ARC ghost hits since last eviction. */ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t grd = ngrd - ogrd; ogrd = ngrd; ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t grm = ngrm - ogrm; ogrm = ngrm; ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t gfd = ngfd - ogfd; ogfd = ngfd; ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t gfm = ngfm - ogfm; ogfm = ngfm; /* Adjust ARC states balance based on ghost hits. */ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, grm + gfm, grd + gfd, zfs_arc_meta_balance); arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); asize = aggsum_value(&arc_sums.arcstat_size); int64_t wt = t - (asize - arc_c); /* * Try to reduce pinned dnodes if more than 3/4 of wanted metadata * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); w = wt * (int64_t)(arc_meta >> 16) >> 16; if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } else if (dn > arc_dnode_limit) { prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; } if (prune > 0) arc_prune_async(prune); /* Evict MRU metadata. */ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; mrum -= bytes; asize -= bytes; /* Evict MFU metadata. */ w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; mfum -= bytes; asize -= bytes; /* Evict MRU data. */ wt -= m - total_evicted; w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; mrud -= bytes; asize -= bytes; /* Evict MFU data. */ e = asize - arc_c; bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); mfud -= bytes; total_evicted += bytes; /* * Evict ghost lists * * Size of each state's ghost list represents how much that state * may grow by shrinking the other states. Would it need to shrink * other states to zero (that is unlikely), its ghost size would be * equal to sum of other three state sizes. But excessive ghost * size may result in false ghost hits (too far back), that may * never result in real cache hits if several states are competing. * So choose some arbitraty point of 1/2 of other state sizes. */ gsrd = (mrum + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - gsrd; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); gsrm = (mrud + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - gsrm; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); gsfd = (mrud + mrum + mfum) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - gsfd; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); gsfm = (mrud + mrum + mfud) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - gsfm; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == NULL); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t c = arc_c; if (c <= arc_c_min) return; /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t asize = aggsum_value(&arc_sums.arcstat_size); if (asize < c) to_free += c - asize; arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ if (arc_evict_needed) return (B_TRUE); /* * If we have buffers in uncached state, evict them periodically. */ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && ddi_get_lbolt() - arc_last_uncached_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Always try to evict from uncached state. */ arc_last_uncached_flush = ddi_get_lbolt(); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); /* Evict from other states only if told to. */ if (arc_evict_needed) evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t can_free = arc_c - arc_c_min; if (can_free > 0) { int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(uint64_t bytes) { /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_upper_bound(&arc_sums.arcstat_size) + 2 * SPA_MAXBLOCKSIZE >= arc_c) { uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; } } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static arc_ovf_level_t arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; if (!use_reserve) overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (alloc_flags & ARC_HDR_ALLOC_LINEAR) return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); else return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size[type], size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. */ static void arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); /* * Update buffer prefetch status. */ boolean_t was_prefetch = HDR_PREFETCH(hdr); boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; if (was_prefetch != now_prefetch) { if (was_prefetch) { ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, prefetch); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); if (was_prefetch) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } else { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (now_prefetch) { if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_prescient_prefetch); } else { ARCSTAT_BUMP(arcstat_predictive_prefetch); } } if (arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { arc_state_t *new_state; /* * This buffer is not in the cache, and does not appear in * our "ghost" lists. Add it to the MRU or uncached state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = now; if (HDR_UNCACHED(hdr)) { new_state = arc_uncached; DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, hdr); } else { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { /* * This buffer has been accessed once recently and either * its read is still in progress or it is in the cache. */ if (HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_arc_access = now; return; } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); /* * If the previous access was a prefetch, then it already * handled possible promotion, so nothing more to do for now. */ if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* * If more than ARC_MINTIME have passed from the previous * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been accessed once recently, but was * evicted from the cache. Would we have bigger MRU, it * would be an MRU hit, so handle it the same way, except * we don't need to check the previous access time. */ hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and either * still in the cache or being restored from one of ghosts. */ if (!HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); } hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { /* * This buffer has been accessed more than once recently, but * has been evicted from the cache. Would we have bigger MFU * it would stay in cache, so move it back to MFU state. */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { /* * This buffer is uncacheable, but we got a hit. Probably * a demand read after prefetch. Nothing more to do here. */ if (!HDR_IO_IN_PROGRESS(hdr)) ARCSTAT_BUMP(arcstat_uncached_hits); hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC and was not accessed * for a long time, so treat it as new and put into MRU. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { /* We need the last one to call below in original order. */ callback_list = acb; if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, &zio->io_bp->blk_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); if (hash_lock != NULL) mutex_exit(hash_lock); /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_prev; if (acb->acb_wait) { mutex_enter(&acb->acb_wait_lock); acb->acb_wait_error = zio->io_error; acb->acb_wait = B_FALSE; cv_signal(&acb->acb_wait_cv); mutex_exit(&acb->acb_wait_lock); /* acb will be freed by the waiting thread. */ } else { kmem_free(acb, sizeof (arc_callback_t)); } } } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: /* * Verify the block pointer contents are reasonable. This should * always be the case since the blkptr is protected by a checksum. * However, if there is damage it's desirable to detect this early * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto done; } zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the acb_wait_cv whereas nowait * reads register a callback. Both are signalled/called * in arc_read_done. * * Errors of the physical I/O may need to be propagated. * Synchronous read errors are returned here from * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ arc_callback_t *acb = NULL; if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; if (*arc_flags & ARC_FLAG_WAIT) { acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); } acb->acb_zb = *zb; if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; if (hdr->b_l1hdr.b_acb) hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_iohits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, iohits); if (*arc_flags & ARC_FLAG_WAIT) { mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } rc = acb->acb_wait_error; mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_TRUE); if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb, &hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { arc_buf_destroy_impl(buf); buf = NULL; (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, hits); *arc_flags |= ARC_FLAG_CACHED; goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ENOENT); goto done; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } } if (*arc_flags & ARC_FLAG_UNCACHED) { arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); if (!encrypted_read) alloc_flags |= ARC_HDR_ALLOC_LINEAR; } /* * Take additional reference for IO_IN_PROGRESS. It stops * arc_access() from putting this header without any buffers * and so other references but obviously nonevictable onto * the evictable list of MRU or MFU state. */ add_reference(hdr, hdr); if (!embedded_bp) arc_access(hdr, *arc_flags, B_FALSE); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && (*arc_flags & ARC_FLAG_PREFETCH))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); done: if (done) done(NULL, zb, bp, buf, private); if (pio && rc != 0) { zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); zio->io_error = rc; zio_nowait(zio); } goto out; } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) hdr->b_crypt_hdr.b_ebufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. - */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); } callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) + arc_write_done_func_t *children_ready, arc_write_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); + arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t) (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { data->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); metadata->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_uncached_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_data, &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_data, &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_data, &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_data, &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_data, &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, &as->arcstat_uncached_data, &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); as->arcstat_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); if (arc_dnode_limit > arc_c_max) arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 0 - */ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; /* * 32-bit fixed point fractions of metadata from total ARC size, * MRU data from all data and MRU metadata from all metadata. */ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (l2arc_trim_ahead > 1) { cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); l2arc_trim_ahead = 1; } if (arc_warm == B_FALSE) size += l2arc_write_boost; size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; top: rerun = B_FALSE; if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * In some cases, we can wind up with size > asize, so * we need to opt for the larger allocation option here. * * (We also need abd_return_buf_copy in all cases because * it's an ASSERT() to modify the buffer before returning it * with arc_return_buf(), and all the compressors * write things before deciding to fail compression in nearly * every case.) */ cabd = abd_alloc_for_io(size, ismd); tmp = abd_borrow_buf(cabd, size); psize = zio_compress_data(compress, to_write, &tmp, size, hdr->b_complevel); if (psize >= asize) { psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, size); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, asize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, size); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { if (pass == 1 || pass == 3) continue; } multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); /* * If the allocated size of this buffer plus the max * size for the pending log block exceeds the evicted * target size, terminate writing buffers for this run. */ if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) { /* * l2ad_hand will be adjusted in * l2arc_log_blk_commit(). */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; return (asize); } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 272e712586fa..1ea075217fb1 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -1,5260 +1,5184 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* * Various statistics about the size of the dbuf cache. */ kstat_named_t cache_count; kstat_named_t cache_size_bytes; kstat_named_t cache_size_bytes_max; /* * Statistics regarding the bounds on the dbuf cache size. */ kstat_named_t cache_target_bytes; kstat_named_t cache_lowater_bytes; kstat_named_t cache_hiwater_bytes; /* * Total number of dbuf cache evictions that have occurred. */ kstat_named_t cache_total_evicts; /* * The distribution of dbuf levels in the dbuf cache and * the total size of all dbufs at each level. */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; /* * Statistics about the dbuf hash table. */ kstat_named_t hash_hits; kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. */ kstat_named_t hash_chains; kstat_named_t hash_chain_max; /* * Number of times a dbuf_create() discovers that a dbuf was * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; /* * Number of entries in the hash table dbuf and mutex arrays. */ kstat_named_t hash_table_count; kstat_named_t hash_mutex_count; /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; kstat_named_t metadata_cache_size_bytes; kstat_named_t metadata_cache_size_bytes_max; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { { "cache_count", KSTAT_DATA_UINT64 }, { "cache_size_bytes", KSTAT_DATA_UINT64 }, { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "cache_target_bytes", KSTAT_DATA_UINT64 }, { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, { "hash_table_count", KSTAT_DATA_UINT64 }, { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; struct { wmsum_t cache_count; wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; wmsum_t hash_chains; wmsum_t hash_insert_race; wmsum_t metadata_cache_count; wmsum_t metadata_cache_overflow; } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ wmsum_add(&dbuf_sums.stat, val); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ DBUF_STAT_INCR(stat, 1); #define DBUF_STAT_BUMPDOWN(stat) \ DBUF_STAT_INCR(stat, -1); #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ continue; \ } static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t cache; zfs_refcount_t size ____cacheline_aligned; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ static uint64_t dbuf_cache_max_bytes = UINT64_MAX; static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ static uint_t dbuf_cache_shift = 5; static uint_t dbuf_metadata_cache_shift = 6; /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ static uint_t dbuf_mutex_cache_shift = 0; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ static uint_t dbuf_cache_hiwater_pct = 10; static uint_t dbuf_cache_lowater_pct = 10; static int dbuf_cons(void *vdb, void *unused, int kmflag) { (void) unused, (void) kmflag; dmu_buf_impl_t *db = vdb; memset(db, 0, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); return (0); } static void dbuf_dest(void *vdb, void *unused) { (void) unused; dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DTRACE_SET_STATE(db, why) \ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ const char *, why) #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid, uint64_t *hash_out) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); if (hash_out != NULL) *hash_out = hv; return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash); idx = db->db_hash & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } if (i > 0) { DBUF_STAT_BUMP(hash_collisions); if (i == 1) DBUF_STAT_BUMP(hash_chains); DBUF_STAT_MAX(hash_chain_max, i); } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); return (NULL); } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_target_bytes()) { DBUF_STAT_BUMP(metadata_cache_overflow); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t idx; dmu_buf_impl_t *dbf, **dbp; ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid), ==, db->db_hash); idx = db->db_hash & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = zfs_refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { /* * Consider indirect blocks and spill blocks to be meta data. */ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * We want to exclude buffers that are on a special allocation class from * L2ARC. */ boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db) { if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || (db->db_objset->os_secondary_cache == ZFS_CACHE_METADATA && dbuf_is_metadata(db))) { if (l2arc_exclude_special == 0) return (B_TRUE); blkptr_t *bp = db->db_blkptr; if (bp == NULL || BP_IS_HOLE(bp)) return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (vd == NULL) return (B_TRUE); if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) return (B_TRUE); } return (B_FALSE); } static inline boolean_t dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) { if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA && (level > 0 || DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) { if (l2arc_exclude_special == 0) return (B_TRUE); if (bp == NULL || BP_IS_HOLE(bp)) return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (vd == NULL) return (B_TRUE); if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) return (B_TRUE); } return (B_FALSE); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } /* * The target size of the dbuf cache can grow with the ARC target, * unless limited by the tunable dbuf_cache_max_bytes. */ static inline unsigned long dbuf_cache_target_bytes(void) { return (MIN(dbuf_cache_max_bytes, arc_target_bytes() >> dbuf_cache_shift)); } /* * The target size of the dbuf metadata cache can grow with the ARC target, * unless limited by the tunable dbuf_metadata_cache_max_bytes. */ static inline unsigned long dbuf_metadata_cache_target_bytes(void) { return (MIN(dbuf_metadata_cache_max_bytes, arc_target_bytes() >> dbuf_metadata_cache_shift)); } static inline uint64_t dbuf_cache_hiwater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); } static inline uint64_t dbuf_cache_lowater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } static inline boolean_t dbuf_cache_above_lowater(void) { return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_lowater_bytes()); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) zfs_refcount_remove_many( &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); } } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ static __attribute__((noreturn)) void dbuf_evict_thread(void *unused) { (void) unused; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the caller's context. */ static void dbuf_evict_notify(uint64_t size) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { if (size > dbuf_cache_hiwater_bytes()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); ds->cache_count.value.ui64 = wmsum_value(&dbuf_sums.cache_count); ds->cache_size_bytes.value.ui64 = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->cache_total_evicts.value.ui64 = wmsum_value(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { ds->cache_levels[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels[i]); ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = wmsum_value(&dbuf_sums.hash_misses); ds->hash_collisions.value.ui64 = wmsum_value(&dbuf_sums.hash_collisions); ds->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->metadata_cache_overflow.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_overflow); return (0); } void dbuf_init(void) { uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; /* * The hash table is big enough to fill one eighth of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; h->hash_table = NULL; while (h->hash_table == NULL) { h->hash_table_mask = hsize - 1; h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); if (h->hash_table == NULL) hsize >>= 1; ASSERT3U(hsize, >=, 1ULL << 10); } /* * The hash table buckets are protected by an array of mutexes where * each mutex is reponsible for protecting 128 buckets. A minimum * array size of 8192 is targeted to avoid contention. */ if (dbuf_mutex_cache_shift == 0) hmsize = MAX(hsize >> 7, 1ULL << 13); else hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); h->hash_mutexes = NULL; while (h->hash_mutexes == NULL) { h->hash_mutex_mask = hmsize - 1; h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), KM_SLEEP); if (h->hash_mutexes == NULL) hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { multilist_create(&dbuf_caches[dcs].cache, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); wmsum_init(&dbuf_sums.hash_chains, 0); wmsum_init(&dbuf_sums.hash_insert_race, 0); wmsum_init(&dbuf_sums.metadata_cache_count, 0); wmsum_init(&dbuf_sums.metadata_cache_overflow, 0); dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = KSTAT_DATA_UINT64; snprintf(dbuf_stats.cache_levels_bytes[i].name, KSTAT_STRLEN, "cache_level_%d_bytes", i); dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } dbuf_ksp->ks_data = &dbuf_stats; dbuf_ksp->ks_update = dbuf_kstat_update; kstat_install(dbuf_ksp); } } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; dbuf_stats_destroy(); for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * sizeof (kmutex_t)); kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(&dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); wmsum_fini(&dbuf_sums.hash_chains); wmsum_fini(&dbuf_sums.hash_insert_race); wmsum_fini(&dbuf_sums.metadata_cache_count); wmsum_fini(&dbuf_sums.metadata_cache_overflow); } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); txg_prev = dr->dr_txg; for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; dr = list_next(&db->db_dirty_records, dr)) { ASSERT(dr->dr_dbuf == db); ASSERT(txg_prev > dr->dr_txg); txg_prev = dr->dr_txg; } } /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb __maybe_unused = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); } } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } static arc_buf_t * dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { spa_t *spa = db->db_objset->os_spa; return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); memcpy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT)))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ const unsigned exp = dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); if (exp >= 8 * sizeof (offset)) { /* This only happens on the highest indirection level */ ASSERT3U(level, ==, dn->dn_nlevels - 1); return (0); } ASSERT3U(exp, <, 8 * sizeof (offset)); return (offset >> exp); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } /* * This function is used to lock the parent of the provided dbuf. This should be * used when modifying or reading db_blkptr. */ db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { rw_enter(&db->db_parent->db_rwlock, rw); ret = DLT_PARENT; } else if (dmu_objset_ds(db->db_objset) != NULL) { rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, tag); ret = DLT_OBJSET; } /* * We only return a DLT_NONE lock when it's the top-most indirect block * of the meta-dnode of the MOS. */ return (ret); } /* * We need to pass the lock type in because it's possible that the block will * move from being the topmost indirect block in a dnode (and thus, have no * parent) to not the top-most via an indirection increase. This would cause a * panic if we didn't pass the lock type in. */ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); else if (type == DLT_OBJSET) rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { (void) zb, (void) bp; dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); memset(buf->b_data, 0, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "freed in flight"); } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "successful read"); } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } /* * Shortcut for performing reads on bonus dbufs. Returns * an error if we fail to verify the dnode associated with * a decrypted block. Otherwise success. */ static int dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { int bonuslen, max_bonuslen, err; err = dbuf_read_verify_dnode_crypt(db, flags); if (err) return (err); bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) memset(db->db.db_data, 0, max_bonuslen); if (bonuslen) memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; int n_bps = indbs >> SPA_BLKPTRSHIFT; for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs); BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(dbbp)); BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); BP_SET_BIRTH(bp, dbbp->blk_birth, 0); } } /* * Handle reads on dbufs that are holes, if necessary. This function * requires that the dbuf's mutex is held. Returns success (0) if action * was taken, ENOENT if no action was taken. */ static int dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (!is_hole && db->db_level == 0) is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); memset(db->db.db_data, 0, db->db.db_size); if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && bp->blk_birth != 0) { dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); return (0); } return (ENOENT); } /* * This function ensures that, when doing a decrypting read of a block, * we make sure we have decrypted the dnode associated with it. We must do * this so that we ensure we are fully authenticating the checksum-of-MACs * tree from the root of the objset down to this block. Indirect blocks are * always verified against their secure checksum-of-MACs assuming that the * dnode containing them is correct. Now that we are doing a decrypting read, * we can be sure that the key is loaded and verify that assumption. This is * especially important considering that we always read encrypted dnode * blocks as raw data (without verifying their MACs) to start, and * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) { int err = 0; objset_t *os = db->db_objset; arc_buf_t *dnode_abuf; dnode_t *dn; zbookmark_phys_t zb; ASSERT(MUTEX_HELD(&db->db_mtx)); if ((flags & DB_RF_NO_DECRYPT) != 0 || !os->os_encrypted || os->os_raw_receive) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { DB_DNODE_EXIT(db); return (0); } SET_BOOKMARK(&zb, dmu_objset_id(os), DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not * available. This is ok if we are only reading authenticated * (and therefore non-encrypted) blocks. */ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || (db->db_blkid == DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; DB_DNODE_EXIT(db); return (err); } /* * Drops db_mtx and the parent lock specified by dblt and tag before * returning. */ static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; blkptr_t bp, *bpp; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { err = dbuf_read_bonus(db, dn, flags); goto early_unlock; } if (db->db_state == DB_UNCACHED) { if (db->db_blkptr == NULL) { bpp = NULL; } else { bp = *db->db_blkptr; bpp = &bp; } } else { dbuf_dirty_record_t *dr; ASSERT3S(db->db_state, ==, DB_NOFILL); /* * Block cloning: If we have a pending block clone, * we don't want to read the underlying block, but the content * of the block being cloned, so we have the most recent data. */ dr = list_head(&db->db_dirty_records); if (dr == NULL || !dr->dt.dl.dr_brtwrite) { err = EIO; goto early_unlock; } bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; ASSERT(bpp != NULL); /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ if (BP_IS_REDACTED(bpp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); err = SET_ERROR(EIO); goto early_unlock; } SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) goto early_unlock; DB_DNODE_EXIT(db); db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; else if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* * The zio layer will copy the provided blkptr later, but we have our * own copy so that we can release the parent's rwlock. We have to * do that so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ dmu_buf_unlock_parent(db, dblt, tag); (void) arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); early_unlock: DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); uint8_t complevel = arc_get_complevel(db->db_buf); if (arc_is_encrypted(db->db_buf)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(db->db_buf, &byteorder, salt, iv, mac); dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL; mutex_enter(&db->db_mtx); if (flags & DB_RF_PARTIAL_FIRST) db->db_partial_read = B_TRUE; else if (!(flags & DB_RF_PARTIAL_MORE)) db->db_partial_read = B_FALSE; if (db->db_state == DB_CACHED) { /* * Ensure that this block's dnode has been decrypted if * the caller has requested decrypted data. */ err = dbuf_read_verify_dnode_crypt(db, flags); /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it * before returning. We also call arc_untransform() on any * unauthenticated blocks, which will verify their MAC if * the key is now available. */ if (err == 0 && db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { spa_t *spa = dn->dn_objset->os_spa; zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); dbuf_fix_old_data(db, spa_syncing_txg(spa)); err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_FALSE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { boolean_t need_wait = B_FALSE; db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); if (zio == NULL && (db->db_state == DB_NOFILL || (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { spa_t *spa = dn->dn_objset->os_spa; zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } err = dbuf_read_impl(db, zio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us */ if (!err && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, db->db_state != DB_CACHED, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* * If we created a zio_root we must execute it to avoid * leaking it, even if it isn't attached to any work due * to an error in dbuf_read_impl(). */ if (need_wait) { if (err == 0) err = zio_wait(zio); else VERIFY0(zio_wait(zio)); } } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); } } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning filled buffer"); } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); release = !dr->dt.dl.dr_brtwrite; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ if (release) arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; dbuf_dirty_record_t *dr; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid, (u_longlong_t)end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (zfs_refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); kmem_free(db_search, sizeof (dmu_buf_impl_t)); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *old_buf; dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ old_buf = db->db_buf; memcpy(buf->b_data, old_buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) memset((uint8_t *)buf->b_data + osize, 0, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(old_buf, db); db->db.db_size = size; dr = list_head(&db->db_dirty_records); /* dirty record added by dmu_buf_will_dirty() */ VERIFY(dr != NULL); if (db->db_level == 0) dr->dt.dl.dr_data = buf; ASSERT3U(dr->dr_txg, ==, tx->tx_txg); ASSERT3U(dr->dr_accounted, ==, osize); dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os __maybe_unused = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) { rw_enter(&dn->dn_struct_rwlock, RW_READER); IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); ASSERT(dn->dn_maxblkid >= blkid); dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; dr->dr_txg = tx->tx_txg; dr->dt.dll.dr_blkid = blkid; dr->dr_accounted = dn->dn_datablksz; /* * There should not be any dbuf for the block that we're dirtying. * Otherwise the buffer contents could be inconsistent between the * dbuf and the lightweight dirty record. */ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid, NULL)); mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } if (dn->dn_nlevels == 1) { ASSERT3U(blkid, <, dn->dn_nblkptr); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); dnode_setdirty(dn, tx); } else { mutex_exit(&dn->dn_mtx); int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); rw_exit(&dn->dn_struct_rwlock); if (parent_db == NULL) { kmem_free(dr, sizeof (*dr)); return (NULL); } int err = dbuf_read(parent_db, NULL, (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err != 0) { dbuf_rele(parent_db, FTAG); kmem_free(dr, sizeof (*dr)); return (NULL); } dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); dbuf_rele(parent_db, FTAG); mutex_enter(&parent_dr->dt.di.dr_mtx); ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); list_insert_tail(&parent_dr->dt.di.dr_children, dr); mutex_exit(&parent_dr->dt.di.dr_mtx); dr->dr_parent = parent_dr; } dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); return (dr); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); dnode_set_dirtyctx(dn, tx, db); if (tx->tx_txg > dn->dn_dirty_txg) dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ dr_head = list_head(&db->db_dirty_records); ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); return (dr_next); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dr->dr_accounted = db->db.db_size; } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_rwlock = B_TRUE; } /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ if (db->db_blkptr != NULL) { db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); ddt_prefetch(os->os_spa, db->db_blkptr); dmu_buf_unlock_parent(db, dblt, FTAG); } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } static void dbuf_undirty_bonus(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); arc_space_return(max_bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; boolean_t brtwrite; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); brtwrite = dr->dt.dl.dr_brtwrite; if (brtwrite) { /* * We are freeing a block that we cloned in the same * transaction group. */ brt_pending_remove(dmu_objset_spa(db->db_objset), &dr->dt.dl.dr_overridden_by, tx); } dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || brtwrite || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr != NULL) { if (dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned * block, we cannot simply redirty it, because * this dr has no data associated with it. * We will go through a full undirtying below, * before dirtying it again. */ undirty = B_TRUE; } else { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); /* * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we * want to make sure dbuf_read() will read the pending cloned block and * not the uderlying block that is being replaced. dbuf_undirty() will * do dbuf_unoverride(), so we will end up with cloned block content, * without overridden BP. */ (void) dbuf_read(db, NULL, flags); if (undirty) { mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); mutex_exit(&db->db_mtx); } (void) dbuf_dirty(db, tx); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; mutex_enter(&db->db_mtx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); mutex_exit(&db->db_mtx); return (dr != NULL); } void dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; /* * Block cloning: We are going to clone into this block, so undirty * modifications done to this block so far in this txg. This includes * writes and clones into this block. */ mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); ASSERT(list_head(&db->db_dirty_records) == NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } mutex_exit(&db->db_mtx); dmu_buf_will_not_fill(db_fake, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); mutex_enter(&db->db_mtx); if (db->db_state == DB_NOFILL) { /* * Block cloning: We will be completely overwriting a block * cloned in this transaction group, so let's undirty the * pending clone and mark the block as uncached. This will be * as if the clone was never done. */ VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); } /* * This function is effectively the same as dmu_buf_will_dirty(), but * indicates the caller expects raw encrypted data in the db, and provides * the crypt params (byteorder, salt, iv, mac) which should be stored in the * blkptr_t when this dbuf is written. This is only used for blocks of * dnodes, during raw receive. */ void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; /* * dr_has_raw_params is only processed for blocks of dnodes * (see dbuf_sync_dnode_leaf_crypt()). */ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN); memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN); memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN); } static void dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) { struct dirty_leaf *dl; dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); old_state = db->db_state; db->db_state = DB_CACHED; if (old_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; DTRACE_SET_STATE(db, "fill done handling freed in flight"); } else { DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dmu_object_type_t type; ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); dmu_buf_will_not_fill(dbuf, tx); blkptr_t bp = { { { {0} } } }; BP_SET_TYPE(&bp, type); BP_SET_LEVEL(&bp, 0); BP_SET_BIRTH(&bp, tx->tx_txg, 0); BP_SET_REDACTED(&bp); BPE_SET_LSIZE(&bp, dbuf->db_size); dbuf_override_impl(db, &bp, tx); } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { /* * In practice, we will never have a case where we have an * encrypted arc buffer while additional holds exist on the * dbuf. We don't handle this here so we simply assert that * fact instead. */ ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); memcpy(db->db.db_data, buf->b_data, db->db.db_size); arc_buf_destroy(buf, db); return; } if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(zfs_refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "buffer cleared"); } } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; DTRACE_SET_STATE(db, "buffer eviction started"); db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter_nested(&dn->dn_dbufs_mtx, NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(zfs_refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or {user|group|project}used * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. */ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dbuf_node)); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_hash = hash; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "bonus buffer created"); db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before it's added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ mutex_exit(&dn->dn_dbufs_mtx); kmem_cache_free(dbuf_kmem_cache, db); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "regular buffer created"); db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || zfs_refcount_count(&dn->dn_holds) > 0); (void) zfs_refcount_add(&dn->dn_holds, db); dprintf_dbuf(db, "db=%p\n", db); return (db); } /* * This function returns a block pointer and information about the object, * given a dnode and a block. This is a publicly accessible version of * dbuf_findbp that only returns some information, rather than the * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock * should be locked as (at least) a reader. */ int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) { dmu_buf_impl_t *dbp = NULL; blkptr_t *bp2; int err = 0; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { ASSERT3P(bp2, !=, NULL); *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); if (datablkszsec != NULL) *datablkszsec = dn->dn_phys->dn_datablkszsec; if (indblkshift != NULL) *indblkshift = dn->dn_phys->dn_indblkshift; } return (err); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { if (dpa->dpa_cb != NULL) { dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level, dpa->dpa_zb.zb_blkid, io_done); } kmem_free(dpa, sizeof (*dpa)); } static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zio, (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; if (abuf != NULL) arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); } /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && dpa->dpa_curlevel == 0) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); dbuf_prefetch_fini(dpa, B_TRUE); return; } ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode && dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS))); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ int dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, NULL); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ goto no_issue; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } ASSERT(!BP_IS_REDACTED(&bp) || dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; dpa->dpa_cb = cb; dpa->dpa_arg = arg; if (!DNODE_LEVEL_IS_CACHEABLE(dn, level)) dpa->dpa_aflags |= ARC_FLAG_UNCACHED; else if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); return (1); no_issue: if (cb != NULL) cb(arg, level, blkid, B_FALSE); return (0); } int dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* * Helper function for dbuf_hold_impl() to copy a buffer. Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). */ noinline static void dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; enum zio_compress compress_type = arc_get_compression(data); uint8_t complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(data, &byteorder, salt, iv, mac); dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else if (compress_type != ZIO_COMPRESS_OFF) { dbuf_set_data(db, arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else { dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } rw_enter(&db->db_rwlock, RW_WRITER); memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, const void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; uint64_t hv; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; dsl_pool_t *dp = spa->spa_dsl_pool; if (dp != NULL) { ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); } ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp, hv); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf != NULL) { arc_buf_access(db->db_buf); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { ASSERT3P(db->db_buf, !=, NULL); dbuf_hold_copy(dn, db); } } if (multilist_link_active(&db->db_cache_link)) { ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } (void) zfs_refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); dbuf_new_size(db, blksz, tx); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, const void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid, NULL); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) zfs_refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&found_db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, const void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void dmu_buf_rele(dmu_buf_t *db, const void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. The 'evicting' * argument should be set if we are already in the dbuf-evicting code * path, in which case we don't want to recursively evict. This allows us to * avoid deeply nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) { int64_t holds; uint64_t size; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = zfs_refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); uint64_t db_size = db->db.db_size; size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; mutex_exit(&db->db_mtx); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX(metadata_cache_size_bytes_max, size); } else { DBUF_STAT_BUMP(cache_count); DBUF_STAT_MAX(cache_size_bytes_max, size); DBUF_STAT_BUMP(cache_levels[db_level]); DBUF_STAT_INCR(cache_levels_bytes[db_level], db_size); } if (dcs == DB_DBUF_CACHE && !evicting) dbuf_evict_notify(size); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (zfs_refcount_count(&db->db_holds)); } uint64_t dmu_buf_user_refcount(dmu_buf_t *db_fake) { uint64_t holds; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; mutex_exit(&db->db_mtx); return (holds); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; void *data = dr->dt.dl.dr_data; ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys)); dbuf_sync_leaf_verify_bonus_dnode(dr); dbuf_undirty_bonus(dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } /* * When syncing out a blocks of dnodes, adjust the block to deal with * encryption. Normally, we make sure the block is decrypted before writing * it. If we have crypt params, then we are writing a raw (encrypted) block, * from a raw receive. In this case, set the ARC buf's crypt params so * that the BP will be filled with the correct byteorder, salt, iv, and mac. */ static void dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) { int err; dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; /* * Unfortunately, there is currently no mechanism for * syncing context to handle decryption errors. An error * here is only possible if an attacker maliciously * changed a dnode block and updated the associated * checksums going up the block tree. */ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); err = arc_untransform(db->db_buf, db->db_objset->os_spa, &zb, B_TRUE); if (err) panic("Invalid dnode block MAC"); } else if (dr->dt.dl.dr_has_raw_params) { (void) arc_release(dr->dt.dl.dr_data, db); arc_convert_to_raw(dr->dt.dl.dr_data, dmu_objset_id(db->db_objset), dr->dt.dl.dr_byteorder, DMU_OT_DNODE, dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * Verify that the size of the data in our bonus buffer does not exceed * its recorded size. * * The purpose of this verification is to catch any cases in development * where the size of a phys structure (i.e space_map_phys_t) grows and, * due to incorrect feature management, older pools expect to read more * data even though they didn't actually write it to begin with. * * For a example, this would catch an error in the feature logic where we * open an older pool and we expect to write the space map histogram of * a space map with size SPACE_MAP_SIZE_V0. */ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG dnode_t *dn = dr->dr_dnode; /* * Encrypted bonus buffers can have data past their bonuslen. * Skip the verification of these blocks. */ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) return; uint16_t bonuslen = dn->dn_phys->dn_bonuslen; uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, maxbonuslen); arc_buf_t *datap = dr->dt.dl.dr_data; char *datap_end = ((char *)datap) + bonuslen; char *datap_max = ((char *)datap) + maxbonuslen; /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) ASSERT(*datap_end == 0); #endif } static blkptr_t * dbuf_lightweight_bp(dbuf_dirty_record_t *dr) { /* This must be a lightweight dirty record. */ ASSERT3P(dr->dr_dbuf, ==, NULL); dnode_t *dn = dr->dr_dnode; if (dn->dn_phys->dn_nlevels == 1) { VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); } else { dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; VERIFY3U(parent_db->db_level, ==, 1); VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); blkptr_t *bp = parent_db->db.db_data; return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); } } static void dbuf_lightweight_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; blkptr_t *bp = zio->io_bp; if (zio->io_error != 0) return; dnode_t *dn = dr->dr_dnode; blkptr_t *bp_orig = dbuf_lightweight_bp(dr); spa_t *spa = dmu_objset_spa(dn->dn_objset); int64_t delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta); uint64_t blkid = dr->dt.dll.dr_blkid; mutex_enter(&dn->dn_mtx); if (blkid > dn->dn_phys->dn_maxblkid) { ASSERT0(dn->dn_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = blkid; } mutex_exit(&dn->dn_mtx); if (!BP_IS_EMBEDDED(bp)) { uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1; BP_SET_FILL(bp, fill); } dmu_buf_impl_t *parent_db; EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); if (dr->dr_parent == NULL) { parent_db = dn->dn_dbuf; } else { parent_db = dr->dr_parent->dr_dbuf; } rw_enter(&parent_db->db_rwlock, RW_WRITER); *bp_orig = *bp; rw_exit(&parent_db->db_rwlock); } -static void -dbuf_lightweight_physdone(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dsl_pool_t *dp = spa_get_dsl(zio->io_spa); - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_lightweight_done(). - */ - int delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - static void dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; VERIFY0(zio->io_error); objset_t *os = dr->dr_dnode->dn_objset; dmu_tx_t *tx = os->os_synctx; if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, zio->io_bp, tx); } - /* - * See comment in dbuf_write_done(). - */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); } noinline static void dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dnode_t *dn = dr->dr_dnode; zio_t *pio; if (dn->dn_phys->dn_nlevels == 1) { pio = dn->dn_zio; } else { pio = dr->dr_parent->dr_zio; } zbookmark_phys_t zb = { .zb_objset = dmu_objset_id(dn->dn_objset), .zb_object = dn->dn_object, .zb_level = 0, .zb_blkid = dr->dt.dll.dr_blkid, }; /* * See comment in dbuf_write(). This is so that zio->io_bp_orig * will have the old BP in dbuf_lightweight_done(). */ dr->dr_bp_copy = *dbuf_lightweight_bp(dr); dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, - dbuf_lightweight_physdone, dbuf_lightweight_done, dr, - ZIO_PRIORITY_ASYNC_WRITE, + dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. Otherwise, after writing out the spill * block to the new allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dr->dr_dbuf == db); dbuf_sync_bonus(dr, tx); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); } /* * If this is a dnode block, ensure it is appropriately encrypted * or decrypted, depending on what we are writing to it this txg. */ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); uint8_t complevel = arc_get_complevel(*datap); if (arc_is_encrypted(*datap)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(*datap, &byteorder, salt, iv, mac); *datap = arc_alloc_raw_buf(os->os_spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, dn->dn_type, psize, lsize, compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type, complevel); } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } memcpy((*datap)->b_data, db->db.db_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); } else { zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } list_remove(list, dr); if (dr->dr_dbuf == NULL) { dbuf_sync_lightweight(dr, tx); } else { if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } } static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) { ASSERT0(db->db_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = db->db_blkid; } mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { dnode_phys_t *dnp = (void *)(((char *)db->db.db_data) + i); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; for (int j = 0; j < dnp->dn_nblkptr; j++) { (void) zfs_blkptr_verify(spa, &dnp->dn_blkptr[j], BLK_CONFIG_SKIP, BLK_VERIFY_HALT); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { (void) zfs_blkptr_verify(spa, DN_SPILL_BLKPTR(dnp), BLK_CONFIG_SKIP, BLK_VERIFY_HALT); } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; (void) zfs_blkptr_verify(spa, ibp, BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; dmu_buf_unlock_parent(db, dblt, FTAG); } /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to prior to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1ULL << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - (void) buf; - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_write_done(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); dbuf_dirty_record_t *dr = db->db_data_pending; dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { arc_buf_destroy(dr->dt.dl.dr_data, db); } } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs __maybe_unused = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); - /* - * If we didn't do a physical write in this ZIO and we - * still ended up here, it means that the space of the - * dbuf that we just released (and undirtied) above hasn't - * been marked as undirtied in the pool's accounting. - * - * Thus, we undirty that space in the pool's view of the - * world here. For physical writes this type of update - * happens in dbuf_write_physdone(). - * - * If we did a physical write, cleanup any rounding errors - * that came up due to writing multiple copies of a block - * on disk [see dbuf_write_physdone()]. - */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_free(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * If the blkptr being remapped is tracked by a livelist, * then we need to make sure the livelist reflects the update. * First, cancel out the old blkptr by appending a 'FREE' * entry. Next, add an 'ALLOC' to track the new version. This * way we avoid trying to free an inaccurate blkptr at delete. * Note that embedded blkptrs are not tracked in livelists. */ if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); bplist_append(&ds->ds_dir->dd_pending_allocs, &bp_copy); } } /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ if (rw != NULL) rw_enter(rw, RW_WRITER); *bp = bp_copy; if (rw != NULL) rw_exit(rw); } } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : &dn->dn_dbuf->db_rwlock); dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, tx); } } } } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *pio; /* parent I/O */ int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); pio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); pio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. */ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, - children_ready_cb, dbuf_write_physdone, - dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); + children_ready_cb, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); EXPORT_SYMBOL(dbuf_hold_level); EXPORT_SYMBOL(dbuf_create_bonus); EXPORT_SYMBOL(dbuf_spill_set_blksz); EXPORT_SYMBOL(dbuf_rm_spill); EXPORT_SYMBOL(dbuf_add_ref); EXPORT_SYMBOL(dbuf_rele); EXPORT_SYMBOL(dbuf_rele_and_unlock); EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, "Percentage over dbuf_cache_max_bytes for direct dbuf eviction."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, "Maximum size in bytes of dbuf metadata cache."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, "Set size of dbuf cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, "Set size of dbuf metadata cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, "Set size of dbuf cache mutex array as log2 shift."); diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 8a13b8f410a1..dda869287c78 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1,2560 +1,2560 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ static uint_t zfs_per_txg_dirty_frees_percent = 30; /* * Enable/disable forcing txg sync when dirty checking for holes with lseek(). * By default this is enabled to ensure accurate hole reporting, it can result * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. * Disabling this option will result in holes never being reported in dirty * files which is always safe. */ static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; static int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; boolean_t missed = B_FALSE; ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; if ((flags & DMU_READ_NO_DECRYPT) != 0) dbuf_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (read) zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); if ((flags & DMU_READ_NO_PREFETCH) == 0 && length <= zfetch_array_rd_sz) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) zio_nowait(zio); return (SET_ERROR(EIO)); } /* * Initiate async demand data read. * We check the db_state after calling dbuf_read() because * (1) dbuf_read() may change the state to CACHED due to a * hit in the ARC, and (2) on a cache miss, a child will * have been added to "zio" but not yet completed, so the * state will not yet be CACHED. */ if (read) { if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && offset + length < db->db.db_offset + db->db.db_size) { if (offset <= db->db.db_offset) dbuf_flags |= DB_RF_PARTIAL_FIRST; else dbuf_flags |= DB_RF_PARTIAL_MORE; } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; } dbp[i] = &db->db; } if (!read) zfs_racct_write(length, nblks); if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { /* wait for async read i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * See comment before the definition of dmu_prefetch_max. */ len = MIN(len, dmu_prefetch_max); /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #else (void) os; #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); memset((char *)buf + newsz, 0, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. */ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset zfs_uio_offset(uio). */ int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that zfs_uiomove won't * block. */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset zfs_uio_offset(uio). */ int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * A "lightweight" write is faster than a regular write (e.g. * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the * data can not be read or overwritten until the transaction's txg has been * synced. This makes it appropriate for workloads that are known to be * (temporarily) write-only, like "zfs receive". * * A single block is written, starting at the specified offset in bytes. If * the call is successful, it returns 0 and the provided abd has been * consumed (the caller should not free it). */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); if (dr == NULL) return (SET_ERROR(EIO)); dr->dt.dll.dr_abd = abd; dr->dt.dll.dr_props = *zp; dr->dt.dll.dr_flags = flags; return (0); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { zfs_racct_write(blksz, 1); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; switch (os->os_redundant_metadata) { case ZFS_REDUNDANT_METADATA_ALL: copies++; break; case ZFS_REDUNDANT_METADATA_MOST: if (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) copies++; break; case ZFS_REDUNDANT_METADATA_SOME: if (DMU_OT_IS_CRITICAL(type)) copies++; break; case ZFS_REDUNDANT_METADATA_NONE: break; } } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * Reports the location of data and holes in an object. In order to * accurately report holes all dirty data must be synced to disk. This * causes extremely poor performance when seeking for holes in a dirty file. * As a compromise, only provide hole data when the dnode is clean. When * a dnode is dirty report the dnode as having no holes by returning EBUSY * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled * then hole reporting has been requested. Dirty dnodes * must be synced to disk to accurately report holes. * * Provided a RL_READER rangelock spanning 0-UINT64_MAX is * held by the caller only a single restart will be required. * We tolerate callers which do not hold the rangelock by * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (restarted) return (SET_ERROR(EBUSY)); txg_wait_synced(dmu_objset_pool(os), 0); restarted = 1; goto restart; } err = SET_ERROR(EBUSY); } else { err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK | (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; blkptr_t *bp; int error, numbufs; error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp); if (error != 0) { if (error == ESRCH) { error = SET_ERROR(ENXIO); } return (error); } ASSERT3U(numbufs, <=, *nbpsp); for (int i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; mutex_enter(&db->db_mtx); if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); if (dr->dt.dl.dr_brtwrite) { /* * This is very special case where we clone a * block and in the same transaction group we * read its BP (most likely to clone the clone). */ bp = &dr->dt.dl.dr_overridden_by; } else { /* * The block was modified in the same * transaction group. */ mutex_exit(&db->db_mtx); error = SET_ERROR(EAGAIN); goto out; } } else { bp = db->db_blkptr; } mutex_exit(&db->db_mtx); if (bp == NULL) { /* * The block was created in this transaction group, * so it has no BP yet. */ error = SET_ERROR(EAGAIN); goto out; } /* * Make sure we clone only data blocks. */ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { error = SET_ERROR(EINVAL); goto out; } bps[i] = *bp; } *nbpsp = numbufs; out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) { spa_t *spa; dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; int error = 0, i, numbufs; spa = os->os_spa; VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); /* * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); goto out; } } for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); dr = list_head(&db->db_dirty_records); VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; dl->dr_overridden_by.blk_phys_birth = 0; } else { dl->dr_overridden_by.blk_birth = dr->dr_txg; if (!BP_IS_EMBEDDED(bp)) { dl->dr_overridden_by.blk_phys_birth = BP_PHYSICAL_BIRTH(bp); } } mutex_exit(&db->db_mtx); /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. * Also, when replaying ZIL we don't want to bump references * in the BRT as it was already done during ZIL claim. */ if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } void byteswap_uint8_array(void *vbuf, size_t size) { (void) vbuf, (void) size; } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index 778b18817eef..d134d4958f7c 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -1,3083 +1,3083 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; /* * Tunable to overwrite the maximum number of threads for the parallelization * of dmu_objset_find_dp, needed to speed up the import of pools with many * datasets. * Default is 4 times the number of leaf vdevs. */ static const int dmu_find_threads = 0; /* * Backfill lower metadnode objects after this many have been freed. * Backfilling negatively impacts object creation rates, so only do it * if there are enough holes to fill. */ static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; static const char *upgrade_tag = "upgrade_tag"; static void dmu_objset_find_dp_cb(void *arg); static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb); static void dmu_objset_upgrade_stop(objset_t *os); void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } uint64_t dmu_objset_dnodesize(objset_t *os) { return (os->os_dnodesize); } zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON); os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress, ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void redundant_metadata_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || newval == ZFS_REDUNDANT_METADATA_MOST || newval == ZFS_REDUNDANT_METADATA_SOME || newval == ZFS_REDUNDANT_METADATA_NONE); os->os_redundant_metadata = newval; } static void dnodesize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; switch (newval) { case ZFS_DNSIZE_LEGACY: os->os_dnodesize = DNODE_MIN_SIZE; break; case ZFS_DNSIZE_AUTO: /* * Choose a dnode size that will work well for most * workloads if the user specified "auto". Future code * improvements could dynamically select a dnode size * based on observed workload patterns. */ os->os_dnodesize = DNODE_MIN_SIZE * 2; break; case ZFS_DNSIZE_1K: case ZFS_DNSIZE_2K: case ZFS_DNSIZE_4K: case ZFS_DNSIZE_8K: case ZFS_DNSIZE_16K: os->os_dnodesize = newval; break; } } static void smallblk_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval <= SPA_MAXBLOCKSIZE); ASSERT(ISP2(newval)); os->os_zpl_special_smallblock = newval; } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } static void recordsize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; os->os_recordsize = newval; } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size >= OBJSET_PHYS_SIZE_V2) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); if (size >= sizeof (objset_phys_t)) dnode_byteswap(&osp->os_projectused_dnode); } } /* * The hash is a CRC-based hash of the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* * The low 6 bits of the pointer don't have much entropy, because * the objset_t is larger than 2^6 bytes long. */ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>24); return (crc); } static unsigned int dnode_multilist_index_func(multilist_t *ml, void *obj) { dnode_t *dn = obj; /* * The low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)dnode_hash(dn->dn_objset, dn->dn_object) % multilist_get_num_sublists(ml)); } static inline boolean_t dmu_os_is_l2cacheable(objset_t *os) { if (os->os_secondary_cache == ZFS_CACHE_ALL || os->os_secondary_cache == ZFS_CACHE_METADATA) { if (l2arc_exclude_special == 0) return (B_TRUE); blkptr_t *bp = os->os_rootbp; if (bp == NULL || BP_IS_HOLE(bp)) return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = os->os_spa->spa_root_vdev; vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (vd == NULL) return (B_TRUE); if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) return (B_TRUE); } return (B_FALSE); } /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. */ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); ASSERT(!BP_IS_REDACTED(bp)); /* * We need the pool config lock to get properties. */ ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool)); /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset * will not exist on pools older than SPA_VERSION_ORIGIN. */ if (ds != NULL && spa_get_dsl(spa) != NULL && spa_get_dsl(spa)->dp_origin_snap != NULL) { ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int size; zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT(BP_IS_AUTHENTICATED(bp)); zio_flags |= ZIO_FLAG_RAW; } dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } if (spa_version(spa) < SPA_VERSION_USERSPACE) size = OBJSET_PHYS_SIZE_V1; else if (!spa_feature_is_enabled(spa, SPA_FEATURE_PROJECT_QUOTA)) size = OBJSET_PHYS_SIZE_V2; else size = sizeof (objset_phys_t); /* Increase the blocksize if we are permitted. */ if (arc_buf_size(os->os_phys_buf) < size) { arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); memset(buf->b_data, 0, size); memcpy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1; os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; memset(os->os_phys, 0, size); } /* * These properties will be filled in by the logic in zfs_get_zplprop() * when they are queried for the first time. */ os->os_version = OBJSET_PROP_UNINITIALIZED; os->os_normalization = OBJSET_PROP_UNINITIALIZED; os->os_utf8only = OBJSET_PROP_UNINITIALIZED; os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ if (ds != NULL) { os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DNODESIZE), dnodesize_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } } if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; os->os_complevel = ZIO_COMPLEVEL_DEFAULT; os->os_encrypted = B_FALSE; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; } if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i]), dnode_multilist_index_func); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); list_link_init(&os->os_evicting_node); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); os->os_obj_next_percpu_len = boot_ncpus; os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) { dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf)) dnode_special_open(os, &os->os_phys->os_projectused_dnode, DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode); } mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL); *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; /* * We need the pool_config lock to manipulate the dsl_dataset_t. * Even if the dataset is long-held, we need the pool_config lock * to open the objset, as it needs to get properties. */ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. */ int dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } int dmu_objset_hold(const char *name, const void *tag, objset_t **osp) { return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); } static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { (void) tag; int err = dmu_objset_from_ds(ds, osp); if (err != 0) { return (err); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { return (SET_ERROR(EROFS)); } else if (!readonly && decrypt && dsl_dir_incompatible_encryption_version(ds->ds_dir)) { return (SET_ERROR(EROFS)); } /* if we are decrypting, we can now check MACs in os->os_phys_buf */ if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa, &zb, B_FALSE); if (err != 0) return (err); ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf)); } return (0); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); if (err != 0) { dsl_dataset_disown(ds, flags, tag); dsl_pool_rele(dp, FTAG); return (err); } /* * User accounting requires the dataset to be decrypted and rw. * We also don't begin user accounting during claiming to help * speed up pool import times and to keep this txg reserved * completely for recovery work. */ if (!readonly && !dp->dp_spa->spa_claiming && (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) { if (dmu_objset_userobjspace_upgradable(*osp) || dmu_objset_projectquota_upgradable(*osp)) { dmu_objset_id_quota_upgrade(*osp); } else if (dmu_objset_userused_enabled(*osp)) { dmu_objset_userspace_upgrade(*osp); } } dsl_pool_rele(dp, FTAG); return (0); } int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); if (err != 0) { dsl_dataset_disown(ds, flags, tag); return (err); } return (0); } void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } void dmu_objset_rele(objset_t *os, const void *tag) { dmu_objset_rele_flags(os, B_FALSE, tag); } /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner * == tag. We will then release and reacquire ownership of the dataset while * holding the pool config_rwlock to avoid intervening namespace or ownership * changes may occur. * * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to * release the hold on its dataset and acquire a new one on the dataset of the * same name so that it can be partially torn down and reconstructed. */ void dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, boolean_t decrypt, const void *tag) { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); dsl_dataset_disown(ds, flags, tag); VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; /* * Stop upgrading thread */ dmu_objset_upgrade_stop(os); dsl_dataset_disown(os->os_dsl_dataset, flags, tag); } void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn_marker; dnode_t *dn; dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); mutex_enter(&os->os_lock); dn = list_head(&os->os_dnodes); while (dn != NULL) { /* * Skip dnodes without holds. We have to do this dance * because dnode_add_ref() only works if there is already a * hold. If the dnode has no holds, then it has no dbufs. */ if (dnode_add_ref(dn, FTAG)) { list_insert_after(&os->os_dnodes, dn, dn_marker); mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); dn = list_next(&os->os_dnodes, dn_marker); list_remove(&os->os_dnodes, dn_marker); } else { dn = list_next(&os->os_dnodes, dn); } } mutex_exit(&os->os_lock); kmem_free(dn_marker, sizeof (dnode_t)); if (DMU_USERUSED_DNODE(os) != NULL) { if (DMU_PROJECTUSED_DNODE(os) != NULL) dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os)); dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); } dnode_evict_dbufs(DMU_META_DNODE(os)); } /* * Objset eviction processing is split into into two pieces. * The first marks the objset as evicting, evicts any dbufs that * have a refcount of zero, and then queues up the objset for the * second phase of eviction. Once os->os_dnodes has been cleared by * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. * The second phase closes the special dnodes, dequeues the objset from * the list of those undergoing eviction, and finally frees the objset. * * NOTE: Due to asynchronous eviction processing (invocation of * dnode_buf_pageout()), it is possible for the meta dnode for the * objset to have no holds even though os->os_dnodes is not empty. */ void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) dsl_prop_unregister_all(ds, os); if (os->os_sa) sa_tear_down(os); dmu_objset_evict_dbufs(os); mutex_enter(&os->os_lock); spa_evicting_os_register(os->os_spa, os); if (list_is_empty(&os->os_dnodes)) { mutex_exit(&os->os_lock); dmu_objset_evict_done(os); } else { mutex_exit(&os->os_lock); } } void dmu_objset_evict_done(objset_t *os) { ASSERT3P(list_head(&os->os_dnodes), ==, NULL); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { if (DMU_PROJECTUSED_DNODE(os)) dnode_special_close(&os->os_projectused_dnode); dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); kmem_free(os->os_obj_next_percpu, os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); mutex_destroy(&os->os_upgrade_lock); for (int i = 0; i < TXG_SIZE; i++) multilist_destroy(&os->os_dirty_dnodes[i]); spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } inode_timespec_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } objset_t * dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (blksz == 0) blksz = DNODE_BLOCK_SIZE; if (ibs == 0) ibs = DN_MAX_INDBLKSHIFT; if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quiescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { if (levels == 0) { levels = 1; /* * Determine the number of levels necessary for the * meta-dnode to contain DN_MAX_OBJECT dnodes. Note * that in order to ensure that we do not overflow * 64 bits, there has to be a nlevels that gives us a * number of blocks > DN_MAX_OBJECT but < 2^64. * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) * (10) must be less than (64 - log2(DN_MAX_OBJECT)) * (16). */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT) levels++; } mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; /* * Enable user accounting if it is enabled and this is not an * encrypted receive. */ if (dmu_objset_userused_enabled(os) && (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) { ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; } if (dmu_objset_projectquota_enabled(os)) { ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; } os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx)); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; proc_t *doca_proc; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; dsl_crypto_params_t *doca_dcp; } dmu_objset_create_arg_t; static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; dsl_dataset_t *parentds; objset_t *parentos; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dataset_nestcheck(doca->doca_name) != 0) return (SET_ERROR(ENAMETOOLONG)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred, doca->doca_proc); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } /* can't create below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(pdd->dd_pool, dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; zio_t *rzio; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, doca->doca_dcp, tx); VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, doca->doca_userarg, doca->doca_cred, tx); } /* * The doca_userfunc() may write out some data that needs to be * encrypted if the dataset is encrypted (specifically the root * directory). This data must be written out before the encryption * key mapping is removed by dsl_dataset_rele_flags(). Force the * I/O to occur immediately by invoking the relevant sections of * dsl_pool_sync(). */ if (os->os_encrypted) { dsl_dataset_t *tmpds = NULL; boolean_t need_sync_done = B_FALSE; mutex_enter(&ds->ds_lock); ds->ds_owner = FTAG; mutex_exit(&ds->ds_lock); rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, tx->tx_txg); if (tmpds != NULL) { dsl_dataset_sync(ds, rzio, tx); need_sync_done = B_TRUE; } VERIFY0(zio_wait(rzio)); dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); } rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, tx->tx_txg); if (tmpds != NULL) { dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, rzio, tx); } VERIFY0(zio_wait(rzio)); if (need_sync_done) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); dsl_dataset_sync_done(ds, tx); dmu_buf_rele(ds->ds_dbuf, ds); } mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); } spa_history_log_internal_ds(ds, "create", tx, " "); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg) { dmu_objset_create_arg_t doca; dsl_crypto_params_t tmp_dcp = { 0 }; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_proc = curproc; doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; doca.doca_type = type; /* * Some callers (mostly for testing) do not provide a dcp on their * own but various code inside the sync task will require it to be * allocated. Rather than adding NULL checks throughout this code * or adding dummy dcp's to all of the callers we simply create a * dummy one here and use that. This zero dcp will have the same * effect as asking for inheritance of all encryption params. */ doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp; int rv = dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, 6, ZFS_SPACE_CHECK_NORMAL); if (rv == 0) zvol_create_minor(name); return (rv); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; proc_t *doca_proc; } dmu_objset_clone_arg_t; static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; int error; dsl_dataset_t *origin; dsl_pool_t *dp = dmu_tx_pool(tx); if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred, doca->doca_proc); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } /* You can only clone snapshots, not the head datasets. */ if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); return (0); } static void dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, doca->doca_cred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_clone(const char *clone, const char *origin) { dmu_objset_clone_arg_t doca; doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); doca.doca_proc = curproc; int rv = dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 6, ZFS_SPACE_CHECK_NORMAL); if (rv == 0) zvol_create_minor(clone); return (rv); } int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { int err; char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); kmem_strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); } static void dmu_objset_upgrade_task_cb(void *data) { objset_t *os = data; mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = EINTR; if (!os->os_upgrade_exit) { int status; mutex_exit(&os->os_upgrade_lock); status = os->os_upgrade_cb(os); mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = status; } os->os_upgrade_exit = B_TRUE; os->os_upgrade_id = 0; mutex_exit(&os->os_upgrade_lock); dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) { if (os->os_upgrade_id != 0) return; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) { os->os_upgrade_exit = B_FALSE; os->os_upgrade_cb = cb; os->os_upgrade_id = taskq_dispatch( os->os_spa->spa_upgrade_taskq, dmu_objset_upgrade_task_cb, os, TQ_SLEEP); if (os->os_upgrade_id == TASKQID_INVALID) { dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); os->os_upgrade_status = ENOMEM; } } else { dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } mutex_exit(&os->os_upgrade_lock); } static void dmu_objset_upgrade_stop(objset_t *os) { mutex_enter(&os->os_upgrade_lock); os->os_upgrade_exit = B_TRUE; if (os->os_upgrade_id != 0) { taskqid_t id = os->os_upgrade_id; os->os_upgrade_id = 0; mutex_exit(&os->os_upgrade_lock); if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) { dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } txg_wait_synced(os->os_spa->spa_dsl_pool, 0); } else { mutex_exit(&os->os_upgrade_lock); } } static void dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) { dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); multilist_sublist_remove(list, dn); /* * See the comment above dnode_rele_task() for an explanation * of why this dnode hold is always needed (even when not * doing user accounting). */ multilist_t *newlist = &dn->dn_objset->os_synced_dnodes; (void) dnode_add_ref(dn, newlist); multilist_insert(newlist, dn); dnode_sync(dn, tx); } } static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { (void) abuf; blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; uint64_t fill = 0; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group/project accounting objects). */ for (int i = 0; i < dnp->dn_nblkptr; i++) fill += BP_GET_FILL(&dnp->dn_blkptr[i]); BP_SET_FILL(bp, fill); if (os->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); *os->os_rootbp = *bp; if (os->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); } static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { (void) abuf; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } kmem_free(bp, sizeof (*bp)); } typedef struct sync_dnodes_arg { multilist_t *sda_list; int sda_sublist_idx; multilist_t *sda_newlist; dmu_tx_t *sda_tx; } sync_dnodes_arg_t; static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; multilist_sublist_t *ms = multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); dmu_objset_sync_dnodes(ms, sda->sda_tx); multilist_sublist_unlock(ms); kmem_free(sda, sizeof (*sda)); } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); *blkptr_copy = *os->os_rootbp; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); /* * If we are either claiming the ZIL or doing a raw receive, write * out the os_phys_buf raw. Neither of these actions will effect the * MAC at this point. */ if (os->os_raw_receive || os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { ASSERT(os->os_encrypted); arc_convert_to_raw(os->os_phys_buf, os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER, DMU_OT_OBJSET, NULL, NULL, NULL); } zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } if (DMU_PROJECTUSED_DNODE(os) && DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_PROJECTUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_PROJECTUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. But it may already * exist because we call dsl_dataset_sync() twice per txg. */ if (os->os_synced_dnodes.ml_sublists == NULL) { multilist_create(&os->os_synced_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff]), dnode_multilist_index_func); } else { ASSERT3U(os->os_synced_dnodes.ml_offset, ==, offsetof(dnode_t, dn_dirty_link[txgoff])); } ml = &os->os_dirty_dnodes[txgoff]; num_sublists = multilist_get_num_sublists(ml); for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) continue; sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); sda->sda_list = ml; sda->sda_sublist_idx = i; sda->sda_tx = tx; (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, sync_dnodes_task, sda, 0); /* callback frees sda */ } taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while ((dr = list_remove_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); zio_nowait(dr->dr_zio); } /* Enable dnode backfill if enough objects have been freed. */ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { os->os_rescan_dnodes = B_TRUE; os->os_freed_dnodes = 0; } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK])); } static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb) { file_cbs[ost] = cb; } int dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data, zfs_file_info_t *zfi) { file_info_cb_t *cb = file_cbs[os->os_phys->os_type]; if (cb == NULL) return (EINVAL); return (cb(bonustype, data, zfi)); } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && file_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } boolean_t dmu_objset_userobjused_enabled(objset_t *os) { return (dmu_objset_userused_enabled(os) && spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING)); } boolean_t dmu_objset_projectquota_enabled(objset_t *os) { return (file_cbs[os->os_phys->os_type] != NULL && DMU_PROJECTUSED_DNODE(os) != NULL && spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA)); } typedef struct userquota_node { /* must be in the first filed, see userquota_update_cache() */ char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN]; int64_t uqn_delta; avl_node_t uqn_node; } userquota_node_t; typedef struct userquota_cache { avl_tree_t uqc_user_deltas; avl_tree_t uqc_group_deltas; avl_tree_t uqc_project_deltas; } userquota_cache_t; static int userquota_compare(const void *l, const void *r) { const userquota_node_t *luqn = l; const userquota_node_t *ruqn = r; int rv; /* * NB: can only access uqn_id because userquota_update_cache() doesn't * pass in an entire userquota_node_t. */ rv = strcmp(luqn->uqn_id, ruqn->uqn_id); return (TREE_ISIGN(rv)); } static void do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) { void *cookie; userquota_node_t *uqn; ASSERT(dmu_tx_is_syncing(tx)); cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, &cookie)) != NULL) { /* * os_userused_lock protects against concurrent calls to * zap_increment_int(). It's needed because zap_increment_int() * is not thread-safe (i.e. not atomic). */ mutex_enter(&os->os_userused_lock); VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, uqn->uqn_id, uqn->uqn_delta, tx)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_user_deltas); cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, &cookie)) != NULL) { mutex_enter(&os->os_userused_lock); VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, uqn->uqn_id, uqn->uqn_delta, tx)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_group_deltas); if (dmu_objset_projectquota_enabled(os)) { cookie = NULL; while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas, &cookie)) != NULL) { mutex_enter(&os->os_userused_lock); VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT, uqn->uqn_id, uqn->uqn_delta, tx)); mutex_exit(&os->os_userused_lock); kmem_free(uqn, sizeof (*uqn)); } avl_destroy(&cache->uqc_project_deltas); } } static void userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta) { userquota_node_t *uqn; avl_index_t idx; ASSERT(strlen(id) < sizeof (uqn->uqn_id)); /* * Use id directly for searching because uqn_id is the first field of * userquota_node_t and fields after uqn_id won't be accessed in * avl_find(). */ uqn = avl_find(avl, (const void *)id, &idx); if (uqn == NULL) { uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id)); avl_insert(avl, uqn, idx); } uqn->uqn_delta += delta; } static void do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, uint64_t project, boolean_t subtract) { if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) { int64_t delta = DNODE_MIN_SIZE + used; char name[20]; if (subtract) delta = -delta; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user); userquota_update_cache(&cache->uqc_user_deltas, name, delta); (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group); userquota_update_cache(&cache->uqc_group_deltas, name, delta); if (dmu_objset_projectquota_enabled(os)) { (void) snprintf(name, sizeof (name), "%llx", (longlong_t)project); userquota_update_cache(&cache->uqc_project_deltas, name, delta); } } } static void do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags, uint64_t user, uint64_t group, uint64_t project, boolean_t subtract) { if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) { char name[20 + DMU_OBJACCT_PREFIX_LEN]; int delta = subtract ? -1 : 1; (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", (longlong_t)user); userquota_update_cache(&cache->uqc_user_deltas, name, delta); (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", (longlong_t)group); userquota_update_cache(&cache->uqc_group_deltas, name, delta); if (dmu_objset_projectquota_enabled(os)) { (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", (longlong_t)project); userquota_update_cache(&cache->uqc_project_deltas, name, delta); } } } typedef struct userquota_updates_arg { objset_t *uua_os; int uua_sublist_idx; dmu_tx_t *uua_tx; } userquota_updates_arg_t; static void userquota_updates_task(void *arg) { userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; dmu_tx_t *tx = uua->uua_tx; dnode_t *dn; userquota_cache_t cache = { { 0 } }; multilist_sublist_t *list = multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); avl_create(&cache.uqc_user_deltas, userquota_compare, sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); avl_create(&cache.uqc_group_deltas, userquota_compare, sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); if (dmu_objset_projectquota_enabled(os)) avl_create(&cache.uqc_project_deltas, userquota_compare, sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); while ((dn = multilist_sublist_head(list)) != NULL) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, &cache, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, dn->dn_oldprojid, B_TRUE); do_userobjquota_update(os, &cache, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, dn->dn_oldprojid, B_TRUE); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, &cache, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, dn->dn_newprojid, B_FALSE); do_userobjquota_update(os, &cache, dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, dn->dn_newprojid, B_FALSE); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_oldprojid = dn->dn_newprojid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } do_userquota_cacheflush(os, &cache, tx); multilist_sublist_unlock(list); kmem_free(uua, sizeof (*uua)); } /* * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be * evicted because the block containing the dnode can't be evicted until it is * written out. However, this hold is necessary to prevent the dnode_t from * being moved (via dnode_move()) while it's still referenced by * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for * dirty_lightweight_leaf-type dirty records. * * If we are doing user-object accounting, the dnode_rele() happens from * userquota_updates_task() instead. */ static void dnode_rele_task(void *arg) { userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; multilist_sublist_t *list = multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } multilist_sublist_unlock(list); kmem_free(uua, sizeof (*uua)); } /* * Return TRUE if userquota updates are needed. */ static boolean_t dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) { if (!dmu_objset_userused_enabled(os)) return (B_FALSE); /* * If this is a raw receive just return and handle accounting * later when we have the keys loaded. We also don't do user * accounting during claiming since the datasets are not owned * for the duration of claiming and this txg should only be * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } if (dmu_objset_projectquota_enabled(os) && DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } return (B_TRUE); } /* * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and * also release the holds on the dnodes from dmu_objset_sync_dnodes(). * The caller must taskq_wait(dp_sync_taskq). */ void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) { boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes); for (int i = 0; i < num_sublists; i++) { userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; /* * If we don't need to update userquotas, use * dnode_rele_task() to call dnode_rele() */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, need_userquota ? userquota_updates_task : dnode_rele_task, uua, 0); /* callback frees uua */ } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. */ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr == NULL) { data = NULL; } else { if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; /* * Raw receives introduce a problem with user accounting. Raw * receives cannot update the user accounting info because the * user ids and the sizes are encrypted. To guarantee that we * never end up with bad user accounting, we simply disable it * during raw receives. We also disable this for normal receives * so that an incremental raw receive may be done on top of an * existing non-raw receive. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ zfs_file_info_t zfi; error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); if (before) { ASSERT(data); dn->dn_olduid = zfi.zfi_user; dn->dn_oldgid = zfi.zfi_group; dn->dn_oldprojid = zfi.zfi_project; } else if (data) { dn->dn_newuid = zfi.zfi_user; dn->dn_newgid = zfi.zfi_group; dn->dn_newprojid = zfi.zfi_project; } /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; dn->dn_newprojid = dn->dn_oldprojid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_newprojid = ZFS_DEFAULT_PROJID; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } boolean_t dmu_objset_userobjspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); } boolean_t dmu_objset_projectquota_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_PROJECTQUOTA_COMPLETE); } static int dmu_objset_space_upgrade(objset_t *os) { uint64_t obj; int err = 0; /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_exit) err = SET_ERROR(EINTR); mutex_exit(&os->os_upgrade_lock); if (err != 0) return (err); if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { dmu_buf_rele(db, FTAG); dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } return (0); } static int dmu_objset_userspace_upgrade_cb(objset_t *os) { int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); err = dmu_objset_space_upgrade(os); if (err) return (err); os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_userspace_upgrade(objset_t *os) { dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb); } static int dmu_objset_id_quota_upgrade_cb(objset_t *os) { int err = 0; if (dmu_objset_userobjspace_present(os) && dmu_objset_projectquota_present(os)) return (0); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (!dmu_objset_projectquota_enabled(os) && dmu_objset_userobjspace_present(os)) return (SET_ERROR(ENOTSUP)); if (dmu_objset_userobjused_enabled(os)) dmu_objset_ds(os)->ds_feature_activation[ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; if (dmu_objset_projectquota_enabled(os)) dmu_objset_ds(os)->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; err = dmu_objset_space_upgrade(os); if (err) return (err); os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; if (dmu_objset_projectquota_enabled(os)) os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_id_quota_upgrade(objset_t *os) { dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb); } boolean_t dmu_objset_userobjspace_upgradable(objset_t *os) { return (dmu_objset_type(os) == DMU_OST_ZFS && !dmu_objset_is_snapshot(os) && dmu_objset_userobjused_enabled(os) && !dmu_objset_userobjspace_present(os) && spa_writeable(dmu_objset_spa(os))); } boolean_t dmu_objset_projectquota_upgradable(objset_t *os) { return (dmu_objset_type(os) == DMU_OST_ZFS && !dmu_objset_is_snapshot(os) && dmu_objset_projectquota_enabled(os) && !dmu_objset_projectquota_present(os) && spa_writeable(dmu_objset_spa(os))); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_NORMALIZE, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strlcpy(name, attr.za_name, namelen); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value) { return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value)); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strlcpy(name, attr.za_name, namelen); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } typedef struct dmu_objset_find_ctx { taskq_t *dc_tq; dsl_pool_t *dc_dp; uint64_t dc_ddobj; char *dc_ddname; /* last component of ddobj's name */ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); void *dc_arg; int dc_flags; kmutex_t *dc_error_lock; int *dc_error; } dmu_objset_find_ctx_t; static void dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) { dsl_pool_t *dp = dcp->dc_dp; dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; int err = 0; /* don't process if there already was an error */ if (*dcp->dc_error != 0) goto out; /* * Note: passing the name (dc_ddname) here is optional, but it * improves performance because we don't need to call * zap_value_search() to determine the name. */ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); if (err != 0) goto out; /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); goto out; } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (dcp->dc_flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); dmu_objset_find_ctx_t *child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP); *child_dcp = *dcp; child_dcp->dc_ddobj = attr->za_first_integer; child_dcp->dc_ddname = spa_strdup(attr->za_name); if (dcp->dc_tq != NULL) (void) taskq_dispatch(dcp->dc_tq, dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); else dmu_objset_find_dp_impl(child_dcp); } zap_cursor_fini(&zc); } /* * Iterate over all snapshots. */ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) { dsl_dir_rele(dd, FTAG); goto out; } /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); /* * Note: we hold the dir while calling dsl_dataset_hold_obj() so * that the dir will remain cached, and we won't have to re-instantiate * it (which could be expensive due to finding its name via * zap_value_search()). */ dsl_dir_rele(dd, FTAG); if (err != 0) goto out; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); out: if (err != 0) { mutex_enter(dcp->dc_error_lock); /* only keep first error */ if (*dcp->dc_error == 0) *dcp->dc_error = err; mutex_exit(dcp->dc_error_lock); } if (dcp->dc_ddname != NULL) spa_strfree(dcp->dc_ddname); kmem_free(dcp, sizeof (*dcp)); } static void dmu_objset_find_dp_cb(void *arg) { dmu_objset_find_ctx_t *dcp = arg; dsl_pool_t *dp = dcp->dc_dp; /* * We need to get a pool_config_lock here, as there are several * assert(pool_config_held) down the stack. Getting a lock via * dsl_pool_config_enter is risky, as it might be stalled by a * pending writer. This would deadlock, as the write lock can * only be granted when our parent thread gives up the lock. * The _prio interface gives us priority over a pending writer. */ dsl_pool_config_enter_prio(dp, FTAG); dmu_objset_find_dp_impl(dcp); dsl_pool_config_exit(dp, FTAG); } /* * Find objsets under and including ddobj, call func(ds) on each. * The order for the enumeration is completely undefined. * func is called with dsl_pool_config held. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { int error = 0; taskq_t *tq = NULL; int ntasks; dmu_objset_find_ctx_t *dcp; kmutex_t err_lock; mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); dcp->dc_tq = NULL; dcp->dc_dp = dp; dcp->dc_ddobj = ddobj; dcp->dc_ddname = NULL; dcp->dc_func = func; dcp->dc_arg = arg; dcp->dc_flags = flags; dcp->dc_error_lock = &err_lock; dcp->dc_error = &error; if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { /* * In case a write lock is held we can't make use of * parallelism, as down the stack of the worker threads * the lock is asserted via dsl_pool_config_held. * In case of a read lock this is solved by getting a read * lock in each worker thread, which isn't possible in case * of a writer lock. So we fall back to the synchronous path * here. * In the future it might be possible to get some magic into * dsl_pool_config_held in a way that it returns true for * the worker threads so that a single lock held from this * thread suffices. For now, stay single threaded. */ dmu_objset_find_dp_impl(dcp); mutex_destroy(&err_lock); return (error); } ntasks = dmu_find_threads; if (ntasks == 0) ntasks = vdev_count_leaves(dp->dp_spa) * 4; tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks, INT_MAX, 0); if (tq == NULL) { kmem_free(dcp, sizeof (*dcp)); mutex_destroy(&err_lock); return (SET_ERROR(ENOMEM)); } dcp->dc_tq = tq; /* dcp will be freed by task */ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); /* * PORTING: this code relies on the property of taskq_wait to wait * until no more tasks are queued and no more tasks are active. As * we always queue new tasks from within other tasks, task_wait * reliably waits for the full recursion to finish, even though we * enqueue new tasks after taskq_wait has been called. * On platforms other than illumos, taskq_wait may not have this * property. */ taskq_wait(tq); taskq_destroy(tq); mutex_destroy(&err_lock); return (error); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. */ static int dmu_objset_find_impl(spa_t *spa, const char *name, int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; dsl_pool_config_enter(dp, FTAG); err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); if (err != 0) { dsl_pool_config_exit(dp, FTAG); return (err); } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); kmem_strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); kmem_strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); dsl_pool_config_exit(dp, FTAG); if (err != 0) return (err); /* Apply to self. */ return (func(name, arg)); } /* * See comment above dmu_objset_find_impl(). */ int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; int error; error = spa_open(name, &spa, FTAG); if (error != 0) return (error); error = dmu_objset_find_impl(spa, name, func, arg, flags); spa_close(spa, FTAG); return (error); } boolean_t dmu_objset_incompatible_encryption_version(objset_t *os) { return (dsl_dir_incompatible_encryption_version( os->os_dsl_dataset->ds_dir)); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } /* * Determine name of filesystem, given name of snapshot. * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ int dmu_fsname(const char *snapname, char *buf) { char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } /* * Call when we think we're going to write/free space in open context * to track the amount of dirty data in the open txg, which is also the * amount of memory that can not be evicted until this txg syncs. * * Note that there are two conditions where this can be called from * syncing context: * * [1] When we just created the dataset, in which case we go on with * updating any accounting of dirty data as usual. * [2] When we are dirtying MOS data, in which case we only update the * pool's accounting of dirty data. */ void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) { dsl_dataset_t *ds = os->os_dsl_dataset; int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); } dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } #if defined(_KERNEL) EXPORT_SYMBOL(dmu_objset_zil); EXPORT_SYMBOL(dmu_objset_pool); EXPORT_SYMBOL(dmu_objset_ds); EXPORT_SYMBOL(dmu_objset_type); EXPORT_SYMBOL(dmu_objset_name); EXPORT_SYMBOL(dmu_objset_hold); EXPORT_SYMBOL(dmu_objset_hold_flags); EXPORT_SYMBOL(dmu_objset_own); EXPORT_SYMBOL(dmu_objset_rele); EXPORT_SYMBOL(dmu_objset_rele_flags); EXPORT_SYMBOL(dmu_objset_disown); EXPORT_SYMBOL(dmu_objset_from_ds); EXPORT_SYMBOL(dmu_objset_create); EXPORT_SYMBOL(dmu_objset_clone); EXPORT_SYMBOL(dmu_objset_stats); EXPORT_SYMBOL(dmu_objset_fast_stat); EXPORT_SYMBOL(dmu_objset_spa); EXPORT_SYMBOL(dmu_objset_space); EXPORT_SYMBOL(dmu_objset_fsid_guid); EXPORT_SYMBOL(dmu_objset_find); EXPORT_SYMBOL(dmu_objset_byteswap); EXPORT_SYMBOL(dmu_objset_evict_dbufs); EXPORT_SYMBOL(dmu_objset_snap_cmtime); EXPORT_SYMBOL(dmu_objset_dnodesize); EXPORT_SYMBOL(dmu_objset_sync); EXPORT_SYMBOL(dmu_objset_is_dirty); EXPORT_SYMBOL(dmu_objset_create_impl_dnstats); EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); EXPORT_SYMBOL(dmu_objset_userspace_present); EXPORT_SYMBOL(dmu_objset_userobjused_enabled); EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable); EXPORT_SYMBOL(dmu_objset_userobjspace_present); EXPORT_SYMBOL(dmu_objset_projectquota_enabled); EXPORT_SYMBOL(dmu_objset_projectquota_present); EXPORT_SYMBOL(dmu_objset_projectquota_upgradable); EXPORT_SYMBOL(dmu_objset_id_quota_upgrade); #endif diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 601d27f8c47a..718bbb34a8d5 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -1,350 +1,359 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2021 by Delphix. All rights reserved. */ #include #include #ifdef ZFS_DEBUG /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ int reference_tracking_enable = B_FALSE; static uint_t reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); +} + +static int +zfs_refcount_compare(const void *x1, const void *x2) +{ + const reference_t *r1 = (const reference_t *)x1; + const reference_t *r2 = (const reference_t *)x2; + + int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); + int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); + int cmp = cmp1 ? cmp1 : cmp2; + return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; } void zfs_refcount_create_tracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_TRUE; } void zfs_refcount_create_untracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_FALSE; } void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; + void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_remove_head(&rc->rc_list))) + while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); - list_destroy(&rc->rc_list); + avl_destroy(&rc->rc_tree); - while ((ref = list_remove_head(&rc->rc_removed))) { - kmem_cache_free(reference_history_cache, ref->ref_removed); + while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } void zfs_refcount_destroy(zfs_refcount_t *rc) { zfs_refcount_destroy_many(rc, 0); } int zfs_refcount_is_zero(zfs_refcount_t *rc) { return (zfs_refcount_count(rc) == 0); } int64_t zfs_refcount_count(zfs_refcount_t *rc) { return (atomic_load_64(&rc->rc_count)); } int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref = NULL; + reference_t *ref; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); } ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; + ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); - list_insert_head(&rc->rc_list, ref); + avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_add(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_add_many(rc, 1, holder)); } void zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) (void) zfs_refcount_add_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_add(rc, holder); } int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref; + reference_t *ref, s; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } + s.ref_holder = holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); + ref = avl_find(&rc->rc_tree, &s, NULL); + if (unlikely(ref == NULL)) { + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); + } + avl_remove(&rc->rc_tree, ref); + if (reference_history > 0) { + list_insert_head(&rc->rc_removed, ref); + if (rc->rc_removed_count >= reference_history) { + ref = list_remove_tail(&rc->rc_removed); + kmem_cache_free(reference_cache, ref); + } else { + rc->rc_removed_count++; } + } else { + kmem_cache_free(reference_cache, ref); } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); } int64_t zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_remove_many(rc, 1, holder)); } void zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) (void) zfs_refcount_remove_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_remove(rc, holder); } void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { - int64_t count, removed_count; - list_t list, removed; + avl_tree_t tree; + list_t removed; + reference_t *ref; + void *cookie = NULL; + uint64_t count; + uint_t removed_count; - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); + avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; - list_move_tail(&dst->rc_list, &list); + if (avl_is_empty(&dst->rc_tree)) + avl_swap(&dst->rc_tree, &tree); + else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) + avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); - list_destroy(&list); + avl_destroy(&tree); list_destroy(&removed); } void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { - reference_t *ref; - boolean_t found = B_FALSE; + reference_t *ref, s; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return; + s.ref_holder = current_holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder && - ref->ref_number == number) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); + ref = avl_find(&rc->rc_tree, &s, NULL); + ASSERT(ref); + ref->ref_holder = new_holder; + avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } void zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, const void *new_holder) { return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, new_holder)); } /* * If tracking is enabled, return true if a reference exists that matches * the "holder" tag. If tracking is disabled, then return true if a reference * might be held. */ boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); - return (B_FALSE); + return (res); } /* * If tracking is enabled, return true if a reference does not exist that * matches the "holder" tag. If tracking is disabled, always return true * since the reference might not be held. */ boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); - return (B_TRUE); + return (res); } EXPORT_SYMBOL(zfs_refcount_create); EXPORT_SYMBOL(zfs_refcount_destroy); EXPORT_SYMBOL(zfs_refcount_is_zero); EXPORT_SYMBOL(zfs_refcount_count); EXPORT_SYMBOL(zfs_refcount_add); EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); /* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index 85c7134ca4c4..a5c76808f2d2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -1,2028 +1,2031 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * Virtual Device Labels * --------------------- * * The vdev label serves several distinct purposes: * * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * * 2. Verify that all the devices given in a configuration are present * within the pool. * * 3. Determine the uberblock for the pool. * * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * * It is important to note that while the kernel is responsible for writing the * label, it only consumes the information in the first three cases. The * latter information is only consumed in userland when determining the * configuration to import a pool. * * * Label Organization * ------------------ * * Before describing the contents of the label, it's important to understand how * the labels are written and updated with respect to the uberblock. * * When the pool configuration is altered, either because it was newly created * or a device was added, we want to update all the labels such that we can deal * with fatal failure at any point. To this end, each disk has two labels which * are updated before and after the uberblock is synced. Assuming we have * labels and an uberblock with the following transaction groups: * * L1 UB L2 * +------+ +------+ +------+ * | | | | | | * | t10 | | t10 | | t10 | * | | | | | | * +------+ +------+ +------+ * * In this stable state, the labels and the uberblock were all updated within * the same transaction group (10). Each label is mirrored and checksummed, so * that we can detect when we fail partway through writing the label. * * In order to identify which labels are valid, the labels are written in the * following manner: * * 1. For each vdev, update 'L1' to the new label * 2. Update the uberblock * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the * UB, we will notice that L1's transaction group is greater than the uberblock, * so L2 must be valid. If we fail after writing the uberblock but before * writing L2, we will notice that L2's transaction group is less than L1, and * therefore L1 is valid. * * Another added complexity is that not every label is updated when the config * is synced. If we add a single device, we do not want to have to re-write * every label for every device in the pool. This means that both L1 and L2 may * be older than the pool uberblock, because the necessary information is stored * on another vdev. * * * On-disk Format * -------------- * * The vdev label consists of two distinct parts, and is wrapped within the * vdev_label_t structure. The label includes 8k of padding to permit legacy * VTOC disk labels, but is otherwise ignored. * * The first half of the label is a packed nvlist which contains pool wide * properties, per-vdev properties, and configuration information. It is * described in more detail below. * * The latter half of the label consists of a redundant array of uberblocks. * These uberblocks are updated whenever a transaction group is committed, * or when the configuration is updated. When a pool is loaded, we scan each * vdev for the 'best' uberblock. * * * Configuration Information * ------------------------- * * The nvlist describing the pool and vdev contains the following elements: * * version ZFS on-disk version * name Pool name * state Pool state * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * * top_guid Unique ID for top-level vdev in which this is contained * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. */ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); } /* * Returns back the vdev label associated with the passed in offset. */ int vdev_label_number(uint64_t psize, uint64_t offset) { int l; if (offset >= psize - VDEV_LABEL_END_SIZE) { offset -= psize - VDEV_LABEL_END_SIZE; offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t); } l = offset / sizeof (vdev_label_t); return (l < VDEV_LABELS ? l : -1); } static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, flags, B_TRUE)); } void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT( spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE || spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE); ASSERT(flags & ZIO_FLAG_CONFIG_WRITER); zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE)); } /* * Generate the nvlist representing this vdev's stats */ void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) { nvlist_t *nvx; vdev_stat_t *vs; vdev_stat_ex_t *vsx; vs = kmem_alloc(sizeof (*vs), KM_SLEEP); vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP); vdev_get_stats_ex(vd, vs, vsx); fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t)); /* * Add extended stats into a special extended stats nvlist. This keeps * all the extended stats nicely grouped together. The extended stats * nvlist is then added to the main nvlist. */ nvx = fnvlist_alloc(); /* ZIOs in flight to disk */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, vsx->vsx_active_queue[ZIO_PRIORITY_REBUILD]); /* ZIOs pending */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, vsx->vsx_pend_queue[ZIO_PRIORITY_REBUILD]); /* Histograms */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, vsx->vsx_total_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_READ], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, vsx->vsx_disk_histo[ZIO_TYPE_WRITE], ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_REBUILD])); /* Request sizes */ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_REBUILD])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD], ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_REBUILD])); /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); fnvlist_free(nvx); kmem_free(vs, sizeof (*vs)); kmem_free(vsx, sizeof (*vsx)); } static void root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { spa_t *spa = vd->vdev_spa; if (vd != spa->spa_root_vdev) return; /* provide either current or previous scan information */ pool_scan_stat_t ps; if (spa_scan_get_stats(spa, &ps) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, sizeof (pool_scan_stat_t) / sizeof (uint64_t)); } pool_removal_stat_t prs; if (spa_removal_get_stats(spa, &prs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, sizeof (prs) / sizeof (uint64_t)); } pool_checkpoint_stat_t pcs; if (spa_checkpoint_get_stats(spa, &pcs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } } static void top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) { if (vd == vd->vdev_top) { vdev_rebuild_stat_t vrs; if (vdev_rebuild_get_stats(vd, &vrs) == 0) { fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, sizeof (vrs) / sizeof (uint64_t)); } } } /* * Generate the nvlist representing this vdev's config. */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) { nvlist_t *nv = NULL; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; nv = fnvlist_alloc(); fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath); if (vd->vdev_enc_sysfs_path != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_ops->vdev_op_config_generate != NULL) vd->vdev_ops->vdev_op_config_generate(vd, nv); if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + if (flags & VDEV_CONFIG_L2CACHE) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array); fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, vd->vdev_ms_shift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); if (vd->vdev_noalloc) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } /* * Slog devices are removed synchronously so don't * persist the vdev_removing flag to the label. */ if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } /* zpool command expects alloc class data */ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: bias = VDEV_ALLOC_BIAS_LOG; break; case VDEV_BIAS_SPECIAL: bias = VDEV_ALLOC_BIAS_SPECIAL; break; case VDEV_BIAS_DEDUP: bias = VDEV_ALLOC_BIAS_DEDUP; break; default: ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } } if (vd->vdev_dtl_sm != NULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, space_map_object(vd->vdev_dtl_sm)); } if (vic->vic_mapping_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, vic->vic_mapping_object); } if (vic->vic_births_object != 0) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, vic->vic_births_object); } if (vic->vic_prev_indirect_vdev != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, vic->vic_prev_indirect_vdev); } if (vd->vdev_crtxg) fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); if (vd->vdev_expansion_time) fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME, vd->vdev_expansion_time); if (flags & VDEV_CONFIG_MOS) { if (vd->vdev_leaf_zap != 0) { ASSERT(vd->vdev_ops->vdev_op_leaf); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, vd->vdev_leaf_zap); } if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, vd->vdev_top_zap); } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, vd->vdev_root_zap); } if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); } } if (getstats) { vdev_config_generate_stats(vd, nv); root_vdev_actions_getprogress(vd, nv); top_vdev_actions_getprogress(vd, nv); /* * Note: this can be called from open context * (spa_get_stats()), so we need the rwlock to prevent * the mapping from being changed by condensing. */ rw_enter(&vd->vdev_indirect_rwlock, RW_READER); if (vd->vdev_indirect_mapping != NULL) { ASSERT(vd->vdev_indirect_births != NULL); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, vdev_indirect_mapping_size(vim)); } rw_exit(&vd->vdev_indirect_rwlock); if (vd->vdev_mg != NULL && vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { /* * Compute approximately how much memory would be used * for the indirect mapping if this device were to * be removed. * * Note: If the frag metric is invalid, then not * enough metaslabs have been converted to have * histograms. */ uint64_t seg_count = 0; uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one * entry per free segment. However, small free * segments (smaller than vdev_removal_max_span) * will be combined with adjacent allocated segments * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { if (i + 1 < highbit64(vdev_removal_max_span) - 1) { to_alloc += vd->vdev_mg->mg_histogram[i] << (i + 1); } else { seg_count += vd->vdev_mg->mg_histogram[i]; } } /* * The maximum length of a mapping is * zfs_remove_max_segment, so we need at least one entry * per zfs_remove_max_segment of allocated data. */ seg_count += to_alloc / spa_remove_max_segment(spa); fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * sizeof (vdev_indirect_mapping_entry_phys_t)); } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); for (c = 0; c < vd->vdev_children; c++) { child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, vd->vdev_children); for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); if (vd->vdev_resilver_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, vd->vdev_resilver_txg); if (vd->vdev_rebuild_txg != 0) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, vd->vdev_rebuild_txg); if (vd->vdev_faulted) fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); /* Set the reason why we're FAULTED/DEGRADED. */ switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: aux = "err_exceeded"; break; case VDEV_AUX_EXTERNAL: aux = "external"; break; } if (aux != NULL && !vd->vdev_tmpoffline) { fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); } else { /* * We're healthy - clear any previous AUX_STATE values. */ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); } if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, vd->vdev_orig_guid); } } return (nv); } /* * Generate a view of the top-level vdevs. If we currently have holes * in the namespace, then generate an array which contains a list of holey * vdevs. Additionally, add the number of top-level children that currently * exist. */ void vdev_top_config_generate(spa_t *spa, nvlist_t *config) { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) { array[idx++] = c; } } if (idx) { VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, array, idx) == 0); } VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, rvd->vdev_children) == 0); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } /* * Returns the configuration from the label of the given vdev. For vdevs * which don't have a txg value stored on their label (i.e. spares/cache) * or have not been completely initialized (txg = 0) just return * the configuration from the first valid label we find. Otherwise, * find the most up-to-date label that does not exceed the specified * 'txg' value. */ nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp[VDEV_LABELS]; abd_t *vp_abd[VDEV_LABELS]; zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(vd->vdev_validate_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); /* * The label for a dRAID distributed spare is not stored on disk. * Instead it is generated when needed which allows us to bypass * the pipeline when reading the config from the label. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_read_config_spare(vd)); for (int l = 0; l < VDEV_LABELS; l++) { vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp[l] = abd_to_buf(vp_abd[l]); } retry: for (int l = 0; l < VDEV_LABELS; l++) { zio[l] = zio_root(spa, NULL, NULL, flags); vdev_label_read(zio[l], vd, l, vp_abd[l], offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); } for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *label = NULL; if (zio_wait(zio[l]) == 0 && nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been * completely initialized so just return the * configuration from the first valid label we * encounter. */ error = nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; for (l++; l < VDEV_LABELS; l++) zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; nvlist_free(config); config = fnvlist_dup(label); } } if (label != NULL) { nvlist_free(label); label = NULL; } } if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } /* * We found a valid label but it didn't pass txg restrictions. */ if (config == NULL && label_txg != 0) { vdev_dbgmsg(vd, "label discarded as txg is too large " "(%llu > %llu)", (u_longlong_t)label_txg, (u_longlong_t)txg); } for (int l = 0; l < VDEV_LABELS; l++) { abd_free(vp_abd[l]); } return (config); } /* * Determine if a device is in use. The 'spare_guid' parameter will be filled * in with the device guid if this spare is active elsewhere on the system. */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; uint64_t vdtxg = 0; nvlist_t *label; if (spare_guid) *spare_guid = 0ULL; if (l2cache_guid) *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. */ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &vdtxg); if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &device_guid) != 0) { nvlist_free(label); return (B_FALSE); } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &txg) != 0)) { nvlist_free(label); return (B_FALSE); } nvlist_free(label); /* * Check to see if this device indeed belongs to the pool it claims to * be a part of. The only way this is allowed is if the device is a hot * spare (which we check for later on). */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && !spa_spare_exists(device_guid, NULL, NULL) && !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* * If the transaction group is zero, then this an initialized (but * unused) label. This is only an error if the create transaction * on-disk is the same as the one we're using now, in which case the * user has attempted to add the same vdev multiple times in the same * transaction. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares * to add. */ if (spa_spare_exists(device_guid, &spare_pool, NULL) || spa_has_spare(spa, device_guid)) { if (spare_guid) *spare_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: return (B_TRUE); case VDEV_LABEL_REPLACE: return (!spa_has_spare(spa, device_guid) || spare_pool != 0ULL); case VDEV_LABEL_SPARE: return (spa_has_spare(spa, device_guid)); default: break; } } /* * Check to see if this is an l2cache device. */ if (spa_l2cache_exists(device_guid, NULL) || spa_has_l2cache(spa, device_guid)) { if (l2cache_guid) *l2cache_guid = device_guid; switch (reason) { case VDEV_LABEL_CREATE: return (B_TRUE); case VDEV_LABEL_REPLACE: return (!spa_has_l2cache(spa, device_guid)); case VDEV_LABEL_L2CACHE: return (spa_has_l2cache(spa, device_guid)); default: break; } } /* * We can't rely on a pool's state if it's been imported * read-only. Instead we look to see if the pools is marked * read-only in the namespace and set the state to active. */ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (spa = spa_by_guid(pool_guid, device_guid)) != NULL && spa_mode(spa) == SPA_MODE_READ) state = POOL_STATE_ACTIVE; /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ return (state == POOL_STATE_ACTIVE); } /* * Initialize a vdev label. We check to make sure each leaf device is not in * use, and writable. We put down an initial label which we will later * overwrite with a complete label. Note that it's important to do this * sequentially, not in parallel, so that we catch cases of multiple use of the * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with * itself. */ int vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) { spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = 0; c < vd->vdev_children; c++) if ((error = vdev_label_init(vd->vdev_child[c], crtxg, reason)) != 0) return (error); /* Track the creation time for this vdev */ vd->vdev_crtxg = crtxg; if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) return (0); /* * Dead vdevs cannot be initialized. */ if (vdev_is_dead(vd)) return (SET_ERROR(EIO)); /* * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (SET_ERROR(EBUSY)); /* * If this is a request to add or replace a spare or l2cache device * that is in use elsewhere on the system, then we must update the * guid (which was initialized to a random value) to reflect the * actual GUID (which is shared between multiple pools). */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && spare_guid != 0ULL) { uint64_t guid_delta = spare_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding a spare, then it's already * labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_SPARE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE || reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && l2cache_guid != 0ULL) { uint64_t guid_delta = l2cache_guid - vd->vdev_guid; vd->vdev_guid += guid_delta; for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += guid_delta; /* * If this is a replacement, then we want to fallthrough to the * rest of the code. If we're adding an l2cache, then it's * already labeled appropriately and we can just return. */ if (reason == VDEV_LABEL_L2CACHE) return (0); ASSERT(reason == VDEV_LABEL_REPLACE); } /* * Initialize its label. */ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. * We mark it as being from txg 0 to indicate that it's not * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ if (reason == VDEV_LABEL_SPARE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { /* * For inactive hot spares, we generate a special label that * identifies as a mutually shared hot spare. We write the * label if we are adding a hot spare, or if we are removing an * active hot spare (in which case we want to revert the * labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else if (reason == VDEV_LABEL_L2CACHE || (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { /* * For level 2 ARC devices, add a special label. */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { uint64_t txg = 0ULL; if (reason == VDEV_LABEL_SPLIT) txg = spa->spa_uberblock.ub_txg; label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple * vdev uses as described above, and automatically expires if we * fail. */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0); } buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL)); } /* * Initialize uberblock template. */ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); abd_zero(ub_abd, VDEV_UBERBLOCK_RING); abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. */ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. */ retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); /* * Skip the 1st padding area. * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ vdev_label_write(zio, vd, l, bootenv, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } nvlist_free(label); abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it * exists as a spare elsewhere in the system. Do the same for * level 2 ARC devices. */ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || spa_spare_exists(vd->vdev_guid, NULL, NULL))) spa_spare_add(vd); if (error == 0 && !vd->vdev_isl2cache && (reason == VDEV_LABEL_L2CACHE || spa_l2cache_exists(vd->vdev_guid, NULL))) spa_l2cache_add(vd); return (error); } /* * Done callback for vdev_label_read_bootenv_impl. If this is the first * callback to finish, store our abd in the callback pointer. Otherwise, we * just free our abd and return. */ static void vdev_label_read_bootenv_done(zio_t *zio) { zio_t *rio = zio->io_private; abd_t **cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); if (zio->io_error == 0) { mutex_enter(&rio->io_lock); if (*cbp == NULL) { /* Will free this buffer in vdev_label_read_bootenv. */ *cbp = zio->io_abd; } else { abd_free(zio->io_abd); } mutex_exit(&rio->io_lock); } else { abd_free(zio->io_abd); } } static void vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) { for (int c = 0; c < vd->vdev_children; c++) vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); /* * We just use the first label that has a correct checksum; the * bootloader should have rewritten them all to be the same on boot, * and any changes we made since boot have been the same across all * labels. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, vdev_label_read_bootenv_done, zio, flags); } } } int vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) { nvlist_t *config; spa_t *spa = rvd->vdev_spa; abd_t *abd = NULL; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(bootenv); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); zio_t *zio = zio_root(spa, NULL, &abd, flags); vdev_label_read_bootenv_impl(zio, rvd, flags); int err = zio_wait(zio); if (abd != NULL) { char *buf; vdev_boot_envblock_t *vbe = abd_to_buf(abd); vbe->vbe_version = ntohll(vbe->vbe_version); switch (vbe->vbe_version) { case VB_RAW: /* * if we have textual data in vbe_bootenv, create nvlist * with key "envmap". */ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW); vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; fnvlist_add_string(bootenv, GRUB_ENVMAP, vbe->vbe_bootenv); break; case VB_NVLIST: err = nvlist_unpack(vbe->vbe_bootenv, sizeof (vbe->vbe_bootenv), &config, 0); if (err == 0) { fnvlist_merge(bootenv, config); nvlist_free(config); break; } zfs_fallthrough; default: /* Check for FreeBSD zfs bootonce command string */ buf = abd_to_buf(abd); if (*buf == '\0') { fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_NVLIST); break; } fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); } /* * abd was allocated in vdev_label_read_bootenv_impl() */ abd_free(abd); /* * If we managed to read any successfully, * return success. */ return (0); } return (err); } int vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) { zio_t *zio; spa_t *spa = vd->vdev_spa; vdev_boot_envblock_t *bootenv; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; int error; size_t nvsize; char *nvbuf; const char *tmp; error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (nvsize >= sizeof (bootenv->vbe_bootenv)) { return (SET_ERROR(E2BIG)); } ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); error = ENXIO; for (int c = 0; c < vd->vdev_children; c++) { int child_err; child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); /* * As long as any of the disks managed to write all of their * labels successfully, return success. */ if (child_err == 0) error = child_err; } if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || !vdev_writeable(vd)) { return (error); } ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); abd_zero(abd, VDEV_PAD_SIZE); bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); nvbuf = bootenv->vbe_bootenv; nvsize = sizeof (bootenv->vbe_bootenv); bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; case VB_NVLIST: error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, KM_SLEEP); break; default: error = EINVAL; break; } if (error == 0) { bootenv->vbe_version = htonll(bootenv->vbe_version); abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); } else { abd_free(abd); return (SET_ERROR(error)); } retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { vdev_label_write(zio, vd, l, abd, offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); } error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { flags |= ZIO_FLAG_TRYHARD; goto retry; } abd_free(abd); return (error); } /* * ========================================================================== * uberblock load/sync * ========================================================================== */ /* * Consider the following situation: txg is safely synced to disk. We've * written the first uberblock for txg + 1, and then we lose power. When we * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); /* * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware * ZFS, e.g. OpenZFS >= 0.7. * * If one ub has MMP and the other does not, they were written by * different hosts, which matters for MMP. So we treat no MMP/no SEQ as * a 0 value. * * Since timestamp and txg are the same if we get this far, either is * acceptable for importing the pool. */ unsigned int seq1 = 0; unsigned int seq2 = 0; if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) seq1 = MMP_SEQ(ub1); if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { uberblock_t *ubl_ubbest; /* Best uberblock */ vdev_t *ubl_vd; /* vdev associated with the above */ }; static void vdev_uberblock_load_done(zio_t *zio) { vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* * Keep track of the vdev in which this uberblock * was found. We will use this information later * to obtain the config nvlist associated with * this uberblock. */ *cbp->ubl_ubbest = *ub; cbp->ubl_vd = vd; } mutex_exit(&rio->io_lock); } abd_free(zio->io_abd); } static void vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } } } } /* * Reads the 'best' uberblock from disk along with its associated * configuration. First, we read the uberblock array of each label of each * vdev, keeping track of the uberblock with the highest txg in each array. * Then, we read the configuration from the same vdev as the best uberblock. */ void vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) { zio_t *zio; spa_t *spa = rvd->vdev_spa; struct ubl_cbdata cb; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; ASSERT(ub); ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); *config = NULL; cb.ubl_ubbest = ub; cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); vdev_uberblock_load_impl(zio, rvd, flags, &cb); (void) zio_wait(zio); /* * It's possible that the best uberblock was discovered on a label * that has a configuration which was written in a future txg. * Search all labels on this vdev to find the configuration that * matches the txg for our uberblock. */ if (cb.ubl_vd != NULL) { vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " "Trying again without txg restrictions."); *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } } spa_config_exit(spa, SCL_ALL, FTAG); } /* * For use when a leaf vdev is expanded. * The location of labels 2 and 3 changed, and at the new location the * uberblock rings are either empty or contain garbage. The sync will write * new configs there because the vdev is dirty, but expansion also needs the * uberblock rings copied. Read them from label 0 which did not move. * * Since the point is to populate labels {2,3} with valid uberblocks, * we zero uberblocks we fail to read or which are not valid. */ static void vdev_copy_uberblocks(vdev_t *vd) { abd_t *ub_abd; zio_t *write_zio; int locks = (SCL_L2ARC | SCL_ZIO); int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* * No uberblocks are stored on distributed spares, they may be * safely skipped when expanding a leaf vdev. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { const int src_label = 0; zio_t *zio; zio = zio_root(vd->vdev_spa, NULL, NULL, flags); vdev_label_read(zio, vd, src_label, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); for (int l = 2; l < VDEV_LABELS; l++) vdev_label_write(write_zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags | ZIO_FLAG_DONT_PROPAGATE); } (void) zio_wait(write_zio); spa_config_exit(vd->vdev_spa, locks, FTAG); abd_free(ub_abd); } /* * On success, increment root zio's count of good writes. * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). */ static void vdev_uberblock_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) atomic_inc_64(good_writes); } /* * Write the uberblock to all labels of all leaves of the specified vdev. */ static void vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, uberblock_t *ub, vdev_t *vd, int flags) { for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_uberblock_sync(zio, good_writes, ub, vd->vdev_child[c], flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * There's no need to write uberblocks to a distributed spare, they * are already stored on all the leaves of the parent dRAID. For * this same reason vdev_uberblock_load_impl() skips distributed * spares when reading uberblocks. */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { vdev_copy_uberblocks(vd); vd->vdev_copy_uberblocks = B_FALSE; } int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ static int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); /* * Flush the uberblocks to disk. This ensures that the odd labels * are no longer needed (because the new uberblocks and the even * labels are safely on disk), so it is safe to overwrite them. */ zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { zio_flush(zio, svd[v]); } } (void) zio_wait(zio); return (good_writes >= 1 ? 0 : EIO); } /* * On success, increment the count of good writes for our top-level vdev. */ static void vdev_label_sync_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) atomic_inc_64(good_writes); } /* * If there weren't enough good writes, indicate failure to the parent. */ static void vdev_label_sync_top_done(zio_t *zio) { uint64_t *good_writes = zio->io_private; if (*good_writes == 0) zio->io_error = SET_ERROR(EIO); kmem_free(good_writes, sizeof (uint64_t)); } /* * We ignore errors for log and cache devices, simply free the private data. */ static void vdev_label_sync_ignore_done(zio_t *zio) { kmem_free(zio->io_private, sizeof (uint64_t)); } /* * Write all even or odd labels to all leaves of the specified vdev. */ static void vdev_label_sync(zio_t *zio, uint64_t *good_writes, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; char *buf; size_t buflen; for (int c = 0; c < vd->vdev_children; c++) { vdev_label_sync(zio, good_writes, vd->vdev_child[c], l, txg, flags); } if (!vd->vdev_ops->vdev_op_leaf) return; if (!vdev_writeable(vd)) return; /* * The top-level config never needs to be written to a distributed * spare. When read vdev_dspare_label_read_config() will generate * the config for the vdev_label_read_config(). */ if (vd->vdev_ops == &vdev_draid_spare_ops) return; /* * Generate a label describing the top-level config to which we belong. */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); abd_zero(vp_abd, sizeof (vdev_phys_t)); vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } abd_free(vp_abd); nvlist_free(label); } static int vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) { list_t *dl = &spa->spa_config_dirty_list; vdev_t *vd; zio_t *zio; int error; /* * Write the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes; ASSERT(!vd->vdev_ishole); good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } error = zio_wait(zio); /* * Flush the new labels to disk. */ zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) zio_flush(zio, vd); (void) zio_wait(zio); return (error); } /* * Sync the uberblock and any changes to the vdev configuration. * * The order of operations is carefully crafted to ensure that * if the system panics or loses power at any time, the state on disk * is still transactionally consistent. The in-line comments below * describe the failure semantics at each stage. * * Moreover, vdev_config_sync() is designed to be idempotent: if it fails * at any time, you can just call it again, and it will resume its work. */ int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; ASSERT(svdcount != 0); retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the * sync process to block while we retry. But if we can't write a * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); flags |= ZIO_FLAG_TRYHARD; } ASSERT(ub->ub_txg <= txg); /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, * and the vdev configuration hasn't changed, * then there's nothing to do. */ if (ub->ub_txg < txg) { boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); if (!changed && list_is_empty(&spa->spa_config_dirty_list)) return (0); } if (txg > spa_freeze_txg(spa)) return (0); ASSERT(txg <= spa->spa_final_txg); /* * Flush the write cache of every disk that's been written to * in this transaction group. This ensures that all blocks * written in this txg will be committed to stable storage * before any uberblock that references them. */ zio_t *zio = zio_root(spa, NULL, NULL, flags); for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) zio_flush(zio, vd); (void) zio_wait(zio); /* * Sync out the even labels (L0, L2) for every dirty vdev. If the * system dies in the middle of this process, that's OK: all of the * even labels that made it to disk will be newer than any uberblock, * and will therefore be considered invalid. The odd labels (L1, L3), * which have not yet been touched, will still be valid. We flush * the new labels to disk to ensure that all even-label updates * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the even labels " "of dirty vdevs", error, spa_name(spa)); } goto retry; } /* * Sync the uberblocks to all vdevs in svd[]. * If the system dies in the middle of this step, there are two cases * to consider, and the on-disk state is consistent either way: * * (1) If none of the new uberblocks made it to disk, then the * previous uberblock will be the newest, and the odd labels * (which had not yet been touched) will be valid with respect * to that uberblock. * * (2) If one or more new uberblocks made it to disk, then they * will be the newest, and the even labels (which had all * been successfully committed) will be valid with respect * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_uberblock_sync_list() returned error " "%d for pool '%s'", error, spa_name(spa)); } goto retry; } if (spa_multihost(spa)) mmp_update_uberblock(spa, ub); /* * Sync out odd labels for every dirty vdev. If the system dies * in the middle of this process, the even labels and the new * uberblocks will suffice to open the pool. The next time * the pool is opened, the first thing we'll do -- before any * user data is modified -- is mark every vdev dirty so that * all labels will be brought up to date. We flush the new labels * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) { zfs_dbgmsg("vdev_label_sync_list() returned error %d " "for pool '%s' when syncing out the odd labels of " "dirty vdevs", error, spa_name(spa)); } goto retry; } return (0); } diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index d7b2217623e6..fb8164f0aea9 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1,5154 +1,5134 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t data_cflags, cflags; data_cflags = KMC_NODEBUG; cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size < PAGESIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; } #endif if (align != 0) { char name[36]; if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. */ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. */ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); - pio->io_child_count++; - cio->io_parent_count++; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); - pio->io_child_count--; - cio->io_parent_count--; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We only do this if the parent's zio type matches the * child's type. Otherwise dispatch the parent zio in its * own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && pio->io_type == zio->io_type) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)bp->blk_phys_birth, (long long)bp->blk_birth, (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return (errors == 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_SKIP: return (errors == 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, zio_flag_t flags, - const zbookmark_phys_t *zb) + zio_done_func_t *done, void *private, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; - zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = NULL; psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cbuf != NULL) zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)roundup(psize, spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)roundup(psize, spa->spa_min_alloc), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * If one copy was requested, store 2 copies of the GBH, so that we * can still traverse all the data (e.g. to free or scrub) even if a * block is damaged. Note that we can't store 3 copies of the GBH in * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. */ int gbh_copies = copies; if (gbh_copies == 1) { gbh_copies = MIN(2, spa_max_replication(spa)); } int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. */ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; zio->io_allocator = allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); nio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 9517ce8073a5..cf438e0e6495 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -1,457 +1,458 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # # This script must remain compatible with Python 3.6+. # import os import re import sys import argparse # # This script parses the stdout of zfstest, which has this format: # # Test: /path/to/testa (run as root) [00:00] [PASS] # Test: /path/to/testb (run as jkennedy) [00:00] [PASS] # Test: /path/to/testc (run as root) [00:00] [FAIL] # [...many more results...] # # Results Summary # FAIL 22 # SKIP 32 # PASS 1156 # # Running Time: 02:50:31 # Percent passed: 95.5% # Log directory: /var/tmp/test_results/20180615T205926 # # # Common generic reasons for a test or test group to be skipped. # # Some test cases are known to fail in ways which are not harmful or dangerous. # In these cases simply mark the test as a known failure until it can be # updated and the issue resolved. Note that it's preferable to open a unique # issue on the GitHub issue tracker for each test case failure. # known_reason = 'Known issue' # # Some tests require that a test user be able to execute the zfs utilities. # This may not be possible when testing in-tree due to the default permissions # on the user's home directory. When testing this can be resolved by granting # group read access. # # chmod 0750 $HOME # exec_reason = 'Test user execute permissions required for utilities' # # Some tests require that the kernel supports renameat2 syscall. # renameat2_reason = 'Kernel renameat2 support required' # # Some tests require the O_TMPFILE flag which was first introduced in the # 3.11 kernel. # tmpfile_reason = 'Kernel O_TMPFILE support required' # # Some tests require the statx(2) system call on Linux which was first # introduced in the 4.11 kernel. # statx_reason = 'Kernel statx(2) system call required on Linux' # # Some tests require that the lsattr utility support the project id feature. # project_id_reason = 'lsattr with set/show project ID required' # # Some tests require that the kernel support user namespaces. # user_ns_reason = 'Kernel user namespace support required' # # Some rewind tests can fail since nothing guarantees that old MOS blocks # are not overwritten. Snapshots protect datasets and data files but not # the MOS. Reasonable efforts are made in the test case to increase the # odds that some txgs will have their MOS data left untouched, but it is # never a sure thing. # rewind_reason = 'Arbitrary pool rewind is not guaranteed' # # Some tests require a minimum version of the fio benchmark utility. # Older distributions such as CentOS 6.x only provide fio-2.0.13. # fio_reason = 'Fio v2.3 or newer required' # # Some tests require that the DISKS provided support the discard operation. # Normally this is not an issue because loop back devices are used for DISKS # and they support discard (TRIM/UNMAP). # trim_reason = 'DISKS must support discard (TRIM/UNMAP)' # # Some tests on FreeBSD require the fspacectl(2) system call and the # truncate(1) utility supporting the -d option. The system call was first # introduced in FreeBSD version 1400032. # fspacectl_reason = 'fspacectl(2) and truncate -d support required' # # Some tests are not applicable to a platform or need to be updated to operate # in the manor required by the platform. Any tests which are skipped for this # reason will be suppressed in the final analysis output. # na_reason = "Not applicable" # # Some test cases doesn't have all requirements to run on Github actions CI. # ci_reason = 'CI runner doesn\'t have all requirements' # # Idmapped mount is only supported in kernel version >= 5.12 # idmap_reason = 'Idmapped mount needs kernel 5.12+' # # These tests are known to fail, thus we use this list to prevent these # failures from failing the job as a whole; only unexpected failures # bubble up to cause this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # known = { 'casenorm/mixed_none_lookup_ci': ['FAIL', 7633], 'casenorm/mixed_formd_lookup_ci': ['FAIL', 7633], 'cli_root/zpool_import/import_rewind_device_replaced': ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], 'rsend/rsend_008_pos': ['SKIP', 6066], 'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason], } if sys.platform.startswith('freebsd'): known.update({ 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], 'cli_root/zpool_resilver/zpool_resilver_concurrent': ['SKIP', na_reason], 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], + 'rsend/send_raw_ashift': ['SKIP', 14961], }) elif sys.platform.startswith('linux'): known.update({ 'casenorm/mixed_formd_lookup': ['FAIL', 7633], 'casenorm/mixed_formd_delete': ['FAIL', 7633], 'casenorm/sensitive_formd_lookup': ['FAIL', 7633], 'casenorm/sensitive_formd_delete': ['FAIL', 7633], 'removal/removal_with_zdb': ['SKIP', known_reason], 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], }) # # These tests may occasionally fail or be skipped. We want there failures # to be reported but only unexpected failures should bubble up to cause # this script to exit with a non-zero exit status. # # Format: { 'test-name': ['expected result', 'issue-number | reason'] } # # For each known failure it is recommended to link to a GitHub issue by # setting the reason to the issue number. Alternately, one of the generic # reasons listed above can be used. # maybe = { 'append/threadsappend_001_pos': ['FAIL', 6136], 'chattr/setup': ['SKIP', exec_reason], 'crtime/crtime_001_pos': ['SKIP', statx_reason], 'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason], 'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason], 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', 5479], 'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason], 'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason], 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839], 'cli_root/zpool_initialize/zpool_initialize_import_export': ['FAIL', 11948], 'cli_root/zpool_labelclear/zpool_labelclear_removed': ['FAIL', known_reason], 'cli_root/zpool_trim/setup': ['SKIP', trim_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', 6141], 'delegate/setup': ['SKIP', exec_reason], 'fallocate/fallocate_punch-hole': ['SKIP', fspacectl_reason], 'history/history_004_pos': ['FAIL', 7026], 'history/history_005_neg': ['FAIL', 6680], 'history/history_006_neg': ['FAIL', 5657], 'history/history_008_pos': ['FAIL', known_reason], 'history/history_010_pos': ['SKIP', exec_reason], 'io/mmap': ['SKIP', fio_reason], 'largest_pool/largest_pool_001_pos': ['FAIL', known_reason], 'mmp/mmp_on_uberblocks': ['FAIL', known_reason], 'pam/setup': ['SKIP', "pamtester might be not available"], 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946], 'projectquota/setup': ['SKIP', exec_reason], 'removal/removal_condense_export': ['FAIL', known_reason], 'renameat2/setup': ['SKIP', renameat2_reason], 'reservation/reservation_008_pos': ['FAIL', 7741], 'reservation/reservation_018_pos': ['FAIL', 5642], 'snapshot/clone_001_pos': ['FAIL', known_reason], 'snapshot/snapshot_009_pos': ['FAIL', 7961], 'snapshot/snapshot_010_pos': ['FAIL', 7961], 'snapused/snapused_004_pos': ['FAIL', 5513], 'tmpfile/setup': ['SKIP', tmpfile_reason], 'trim/setup': ['SKIP', trim_reason], 'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason], 'user_namespace/setup': ['SKIP', user_ns_reason], 'userquota/setup': ['SKIP', exec_reason], 'vdev_zaps/vdev_zaps_004_pos': ['FAIL', known_reason], 'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', 5848], } if sys.platform.startswith('freebsd'): maybe.update({ 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'cli_root/zfs_share/zfs_share_concurrent_shares': ['FAIL', known_reason], 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], 'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], 'fault/auto_spare_multiple': ['FAIL', 11889], 'fault/auto_spare_shared': ['FAIL', 11889], 'fault/decompress_fault': ['FAIL', 11889], 'io/io_uring': ['SKIP', 'io_uring support required'], 'limits/filesystem_limit': ['SKIP', known_reason], 'limits/snapshot_limit': ['SKIP', known_reason], 'mmp/mmp_active_import': ['FAIL', known_reason], 'mmp/mmp_exported_import': ['FAIL', known_reason], 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], }) # Not all Github actions runners have scsi_debug module, so we may skip # some tests which use it. if os.environ.get('CI') == 'true': known.update({ 'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', ci_reason], 'cli_root/zpool_expand/zpool_expand_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/setup': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_001_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_002_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_004_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_005_pos': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_006_neg': ['SKIP', ci_reason], 'cli_root/zpool_reopen/zpool_reopen_007_pos': ['SKIP', ci_reason], 'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason], 'fault/auto_offline_001_pos': ['SKIP', ci_reason], 'fault/auto_online_001_pos': ['SKIP', ci_reason], 'fault/auto_online_002_pos': ['SKIP', ci_reason], 'fault/auto_replace_001_pos': ['SKIP', ci_reason], 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) maybe.update({ 'events/events_002_pos': ['FAIL', 11546], }) def process_results(pathname): try: f = open(pathname) except IOError as e: print('Error opening file:', e) sys.exit(1) prefix = '/zfs-tests/tests/(?:functional|perf/regression)/' pattern = \ r'^Test(?:\s+\(\S+\))?:' + \ rf'\s*\S*{prefix}(\S+)' + \ r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} logdir = 'Could not determine log directory.' for line in f.readlines(): m = re.match(pattern, line) if m and len(m.groups()) == 4: d[m.group(1)] = m.group(4) continue m = re.match(pattern_log, line) if m: logdir = m.group(1) return d, logdir class ListMaybesAction(argparse.Action): def __init__(self, option_strings, dest="SUPPRESS", default="SUPPRESS", help="list flaky tests and exit"): super(ListMaybesAction, self).__init__( option_strings=option_strings, dest=dest, default=default, nargs=0, help=help) def __call__(self, parser, namespace, values, option_string=None): for test in maybe: print(test) sys.exit(0) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Analyze ZTS logs') parser.add_argument('logfile') parser.add_argument('--list-maybes', action=ListMaybesAction) parser.add_argument('--no-maybes', action='store_false', dest='maybes') args = parser.parse_args() results, logdir = process_results(args.logfile) if not results: print("\n\nNo test results were found.") print("Log directory:", logdir) sys.exit(0) expected = [] unexpected = [] all_maybes = True for test in list(results.keys()): if results[test] == "PASS": continue setup = test.replace(os.path.basename(test), "setup") if results[test] == "SKIP" and test != setup: if setup in known and known[setup][0] == "SKIP": continue if setup in maybe and maybe[setup][0] == "SKIP": continue if (test in known and results[test] in known[test][0]): expected.append(test) elif test in maybe and results[test] in maybe[test][0]: if results[test] == 'SKIP' or args.maybes: expected.append(test) elif not args.maybes: unexpected.append(test) else: unexpected.append(test) all_maybes = False print("\nTests with results other than PASS that are expected:") for test in sorted(expected): issue_url = 'https://github.com/openzfs/zfs/issues/' # Include the reason why the result is expected, given the following: # 1. Suppress test results which set the "Not applicable" reason. # 2. Numerical reasons are assumed to be GitHub issue numbers. # 3. When an entire test group is skipped only report the setup reason. if test in known: if known[test][1] == na_reason: continue elif isinstance(known[test][1], int): expect = f"{issue_url}{known[test][1]}" else: expect = known[test][1] elif test in maybe: if isinstance(maybe[test][1], int): expect = f"{issue_url}{maybe[test][1]}" else: expect = maybe[test][1] elif setup in known and known[setup][0] == "SKIP" and setup != test: continue elif setup in maybe and maybe[setup][0] == "SKIP" and setup != test: continue else: expect = "UNKNOWN REASON" print(f" {results[test]} {test} ({expect})") print("\nTests with result of PASS that are unexpected:") for test in sorted(known.keys()): # We probably should not be silently ignoring the case # where "test" is not in "results". if test not in results or results[test] != "PASS": continue print(f" {results[test]} {test} (expected {known[test][0]})") print("\nTests with results other than PASS that are unexpected:") for test in sorted(unexpected): expect = "PASS" if test not in known else known[test][0] print(f" {results[test]} {test} (expected {expect})") if len(unexpected) == 0: sys.exit(0) elif not args.maybes and all_maybes: sys.exit(2) else: sys.exit(1) diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index 133f8387ddaf..844caa17d8ed 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1,3875 +1,3875 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2009, Sun Microsystems Inc. All rights reserved. # Copyright (c) 2012, 2020, Delphix. All rights reserved. # Copyright (c) 2017, Tim Chase. All rights reserved. # Copyright (c) 2017, Nexenta Systems Inc. All rights reserved. # Copyright (c) 2017, Lawrence Livermore National Security LLC. # Copyright (c) 2017, Datto Inc. All rights reserved. # Copyright (c) 2017, Open-E Inc. All rights reserved. # Copyright (c) 2021, The FreeBSD Foundation. # Use is subject to license terms. # . ${STF_SUITE}/include/tunables.cfg . ${STF_TOOLS}/include/logapi.shlib . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib # # Apply constrained path when available. This is required since the # PATH may have been modified by sudo's secure_path behavior. # if [ -n "$STF_PATH" ]; then export PATH="$STF_PATH" fi # # Generic dot version comparison function # # Returns success when version $1 is greater than or equal to $2. # function compare_version_gte { [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } # Linux kernel version comparison function # # $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version # # Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] # function linux_version { typeset ver="$1" [ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+") typeset version major minor _ IFS='.' read -r version major minor _ <<<"$ver" [ -z "$version" ] && version=0 [ -z "$major" ] && major=0 [ -z "$minor" ] && minor=0 echo $((version * 100000 + major * 1000 + minor)) } # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise function is_linux { [ "$UNAME" = "Linux" ] } # Determine if this is an illumos test system # # Return 0 if platform illumos, 1 if otherwise function is_illumos { [ "$UNAME" = "illumos" ] } # Determine if this is a FreeBSD test system # # Return 0 if platform FreeBSD, 1 if otherwise function is_freebsd { [ "$UNAME" = "FreeBSD" ] } # Determine if this is a 32-bit system # # Return 0 if platform is 32-bit, 1 if otherwise function is_32bit { [ $(getconf LONG_BIT) = "32" ] } # Determine if kmemleak is enabled # # Return 0 if kmemleak is enabled, 1 if otherwise function is_kmemleak { is_linux && [ -e /sys/kernel/debug/kmemleak ] } # Determine whether a dataset is mounted # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs # # Return 0 if dataset is mounted; 1 if unmounted; 2 on error function ismounted { typeset fstype=$2 [[ -z $fstype ]] && fstype=zfs typeset out dir name case $fstype in zfs) if [[ "$1" == "/"* ]] ; then ! zfs mount | awk -v fs="$1" '$2 == fs {exit 1}' else ! zfs mount | awk -v ds="$1" '$1 == ds {exit 1}' fi ;; ufs|nfs) if is_freebsd; then mount -pt $fstype | while read dev dir _t _flags; do [[ "$1" == "$dev" || "$1" == "$dir" ]] && return 0 done else out=$(df -F $fstype $1 2>/dev/null) || return dir=${out%%\(*} dir=${dir%% *} name=${out##*\(} name=${name%%\)*} name=${name%% *} [[ "$1" == "$dir" || "$1" == "$name" ]] && return 0 fi ;; ext*) df -t $fstype $1 > /dev/null 2>&1 ;; zvol) if [[ -L "$ZVOL_DEVDIR/$1" ]]; then link=$(readlink -f $ZVOL_DEVDIR/$1) [[ -n "$link" ]] && \ mount | grep -q "^$link" && \ return 0 fi ;; *) false ;; esac } # Return 0 if a dataset is mounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function mounted { ismounted $1 $2 } # Return 0 if a dataset is unmounted; 1 otherwise # # $1 dataset name # $2 filesystem type; optional - defaulted to zfs function unmounted { ! ismounted $1 $2 } function default_setup { default_setup_noexit "$@" log_pass } function default_setup_no_mountpoint { default_setup_noexit "$1" "$2" "$3" "yes" log_pass } # # Given a list of disks, setup storage pools and datasets. # function default_setup_noexit { typeset disklist=$1 typeset container=$2 typeset volume=$3 typeset no_mountpoint=$4 log_note begin default_setup_noexit if is_global_zone; then if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL $disklist else reexport_pool fi rm -rf $TESTDIR || log_unresolved Could not remove $TESTDIR mkdir -p $TESTDIR || log_unresolved Could not create $TESTDIR log_must zfs create $TESTPOOL/$TESTFS if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS fi if [[ -n $container ]]; then rm -rf $TESTDIR1 || \ log_unresolved Could not remove $TESTDIR1 mkdir -p $TESTDIR1 || \ log_unresolved Could not create $TESTDIR1 log_must zfs create $TESTPOOL/$TESTCTR log_must zfs set canmount=off $TESTPOOL/$TESTCTR log_must zfs create $TESTPOOL/$TESTCTR/$TESTFS1 if [[ -z $no_mountpoint ]]; then log_must zfs set mountpoint=$TESTDIR1 \ $TESTPOOL/$TESTCTR/$TESTFS1 fi fi if [[ -n $volume ]]; then if is_global_zone ; then log_must zfs create -V $VOLSIZE $TESTPOOL/$TESTVOL block_device_wait else log_must zfs create $TESTPOOL/$TESTVOL fi fi } # # Given a list of disks, setup a storage pool, file system and # a container. # function default_container_setup { typeset disklist=$1 default_setup "$disklist" "true" } # # Given a list of disks, setup a storage pool,file system # and a volume. # function default_volume_setup { typeset disklist=$1 default_setup "$disklist" "" "true" } # # Given a list of disks, setup a storage pool,file system, # a container and a volume. # function default_container_volume_setup { typeset disklist=$1 default_setup "$disklist" "true" "true" } # # Create a snapshot on a filesystem or volume. Defaultly create a snapshot on # filesystem # # $1 Existing filesystem or volume name. Default, $TESTPOOL/$TESTFS # $2 snapshot name. Default, $TESTSNAP # function create_snapshot { typeset fs_vol=${1:-$TESTPOOL/$TESTFS} typeset snap=${2:-$TESTSNAP} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." if snapexists $fs_vol@$snap; then log_fail "$fs_vol@$snap already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." log_must zfs snapshot $fs_vol@$snap } # # Create a clone from a snapshot, default clone name is $TESTCLONE. # # $1 Existing snapshot, $TESTPOOL/$TESTFS@$TESTSNAP is default. # $2 Clone name, $TESTPOOL/$TESTCLONE is default. # function create_clone # snapshot clone { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} typeset clone=${2:-$TESTPOOL/$TESTCLONE} [[ -z $snap ]] && \ log_fail "Snapshot name is undefined." [[ -z $clone ]] && \ log_fail "Clone name is undefined." log_must zfs clone $snap $clone } # # Create a bookmark of the given snapshot. Defaultly create a bookmark on # filesystem. # # $1 Existing filesystem or volume name. Default, $TESTFS # $2 Existing snapshot name. Default, $TESTSNAP # $3 bookmark name. Default, $TESTBKMARK # function create_bookmark { typeset fs_vol=${1:-$TESTFS} typeset snap=${2:-$TESTSNAP} typeset bkmark=${3:-$TESTBKMARK} [[ -z $fs_vol ]] && log_fail "Filesystem or volume's name is undefined." [[ -z $snap ]] && log_fail "Snapshot's name is undefined." [[ -z $bkmark ]] && log_fail "Bookmark's name is undefined." if bkmarkexists $fs_vol#$bkmark; then log_fail "$fs_vol#$bkmark already exists." fi datasetexists $fs_vol || \ log_fail "$fs_vol must exist." snapexists $fs_vol@$snap || \ log_fail "$fs_vol@$snap must exist." log_must zfs bookmark $fs_vol@$snap $fs_vol#$bkmark } # # Create a temporary clone result of an interrupted resumable 'zfs receive' # $1 Destination filesystem name. Must not exist, will be created as the result # of this function along with its %recv temporary clone # $2 Source filesystem name. Must not exist, will be created and destroyed # function create_recv_clone { typeset recvfs="$1" typeset sendfs="${2:-$TESTPOOL/create_recv_clone}" typeset snap="$sendfs@snap1" typeset incr="$sendfs@snap2" typeset mountpoint="$TESTDIR/create_recv_clone" typeset sendfile="$TESTDIR/create_recv_clone.zsnap" [[ -z $recvfs ]] && log_fail "Recv filesystem's name is undefined." datasetexists $recvfs && log_fail "Recv filesystem must not exist." datasetexists $sendfs && log_fail "Send filesystem must not exist." log_must zfs create -o compression=off -o mountpoint="$mountpoint" $sendfs log_must zfs snapshot $snap log_must eval "zfs send $snap | zfs recv -u $recvfs" log_must mkfile 1m "$mountpoint/data" log_must zfs snapshot $incr log_must eval "zfs send -i $snap $incr | dd bs=10K count=1 \ iflag=fullblock > $sendfile" log_mustnot eval "zfs recv -su $recvfs < $sendfile" destroy_dataset "$sendfs" "-r" log_must rm -f "$sendfile" if [[ $(get_prop 'inconsistent' "$recvfs/%recv") -ne 1 ]]; then log_fail "Error creating temporary $recvfs/%recv clone" fi } function default_mirror_setup { default_mirror_setup_noexit $1 $2 $3 log_pass } # # Given a pair of disks, set up a storage pool and dataset for the mirror # @parameters: $1 the primary side of the mirror # $2 the secondary side of the mirror # @uses: ZPOOL ZFS TESTPOOL TESTFS function default_mirror_setup_noexit { readonly func="default_mirror_setup_noexit" typeset primary=$1 typeset secondary=$2 [[ -z $primary ]] && \ log_fail "$func: No parameters passed" [[ -z $secondary ]] && \ log_fail "$func: No secondary partition passed" [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL mirror $@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Destroy the configured testpool mirrors. # the mirrors are of the form ${TESTPOOL}{number} # @uses: ZPOOL ZFS TESTPOOL function destroy_mirrors { default_cleanup_noexit log_pass } function default_raidz_setup { default_raidz_setup_noexit "$*" log_pass } # # Given a minimum of two disks, set up a storage pool and dataset for the raid-z # $1 the list of disks # function default_raidz_setup_noexit { typeset disklist="$*" disks=(${disklist[*]}) if [[ ${#disks[*]} -lt 2 ]]; then log_fail "A raid-z requires a minimum of two disks." fi [[ -d /$TESTPOOL ]] && rm -rf /$TESTPOOL log_must zpool create -f $TESTPOOL raidz $disklist log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS } # # Common function used to cleanup storage pools and datasets. # # Invoked at the start of the test suite to ensure the system # is in a known state, and also at the end of each set of # sub-tests to ensure errors from one set of tests doesn't # impact the execution of the next set. function default_cleanup { default_cleanup_noexit log_pass } # # Utility function used to list all available pool names. # # NOTE: $KEEP is a variable containing pool names, separated by a newline # character, that must be excluded from the returned list. # function get_all_pools { zpool list -H -o name | grep -Fvx "$KEEP" | grep -v "$NO_POOLS" } function default_cleanup_noexit { typeset pool="" # # Destroying the pool will also destroy any # filesystems it contains. # if is_global_zone; then zfs unmount -a > /dev/null 2>&1 ALL_POOLS=$(get_all_pools) # Here, we loop through the pools we're allowed to # destroy, only destroying them if it's safe to do # so. while [ ! -z ${ALL_POOLS} ] do for pool in ${ALL_POOLS} do if safe_to_destroy_pool $pool ; then destroy_pool $pool fi done ALL_POOLS=$(get_all_pools) done zfs mount -a else typeset fs="" for fs in $(zfs list -H -o name \ | grep "^$ZONE_POOL/$ZONE_CTR[01234]/"); do destroy_dataset "$fs" "-Rf" done # Need cleanup here to avoid garbage dir left. for fs in $(zfs list -H -o name); do [[ $fs == /$ZONE_POOL ]] && continue [[ -d $fs ]] && log_must rm -rf $fs/* done # # Reset the $ZONE_POOL/$ZONE_CTR[01234] file systems property to # the default value # for fs in $(zfs list -H -o name); do if [[ $fs == $ZONE_POOL/$ZONE_CTR[01234] ]]; then log_must zfs set reservation=none $fs log_must zfs set recordsize=128K $fs log_must zfs set mountpoint=/$fs $fs typeset enc=$(get_prop encryption $fs) if [ -z "$enc" ] || [ "$enc" = "off" ]; then log_must zfs set checksum=on $fs fi log_must zfs set compression=off $fs log_must zfs set atime=on $fs log_must zfs set devices=off $fs log_must zfs set exec=on $fs log_must zfs set setuid=on $fs log_must zfs set readonly=off $fs log_must zfs set snapdir=hidden $fs log_must zfs set aclmode=groupmask $fs log_must zfs set aclinherit=secure $fs fi done fi [[ -d $TESTDIR ]] && \ log_must rm -rf $TESTDIR disk1=${DISKS%% *} if is_mpath_device $disk1; then delete_partitions fi rm -f $TEST_BASE_DIR/{err,out} } # # Common function used to cleanup storage pools, file systems # and containers. # function default_container_cleanup { if ! is_global_zone; then reexport_pool fi ismounted $TESTPOOL/$TESTCTR/$TESTFS1 && log_must zfs unmount $TESTPOOL/$TESTCTR/$TESTFS1 destroy_dataset "$TESTPOOL/$TESTCTR/$TESTFS1" "-R" destroy_dataset "$TESTPOOL/$TESTCTR" "-Rf" [[ -e $TESTDIR1 ]] && \ log_must rm -rf $TESTDIR1 default_cleanup } # # Common function used to cleanup snapshot of file system or volume. Default to # delete the file system's snapshot # # $1 snapshot name # function destroy_snapshot { typeset snap=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if ! snapexists $snap; then log_fail "'$snap' does not exist." fi # # For the sake of the value which come from 'get_prop' is not equal # to the really mountpoint when the snapshot is unmounted. So, firstly # check and make sure this snapshot's been mounted in current system. # typeset mtpt="" if ismounted $snap; then mtpt=$(get_prop mountpoint $snap) fi destroy_dataset "$snap" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup clone. # # $1 clone name # function destroy_clone { typeset clone=${1:-$TESTPOOL/$TESTCLONE} if ! datasetexists $clone; then log_fail "'$clone' does not existed." fi # With the same reason in destroy_snapshot typeset mtpt="" if ismounted $clone; then mtpt=$(get_prop mountpoint $clone) fi destroy_dataset "$clone" [[ $mtpt != "" && -d $mtpt ]] && \ log_must rm -rf $mtpt } # # Common function used to cleanup bookmark of file system or volume. Default # to delete the file system's bookmark. # # $1 bookmark name # function destroy_bookmark { typeset bkmark=${1:-$TESTPOOL/$TESTFS#$TESTBKMARK} if ! bkmarkexists $bkmark; then log_fail "'$bkmarkp' does not existed." fi destroy_dataset "$bkmark" } # Return 0 if a snapshot exists; $? otherwise # # $1 - snapshot name function snapexists { zfs list -H -t snapshot "$1" > /dev/null 2>&1 } # # Return 0 if a bookmark exists; $? otherwise # # $1 - bookmark name # function bkmarkexists { zfs list -H -t bookmark "$1" > /dev/null 2>&1 } # # Return 0 if a hold exists; $? otherwise # # $1 - hold tag # $2 - snapshot name # function holdexists { ! zfs holds "$2" | awk -v t="$1" '$2 ~ t { exit 1 }' } # # Set a property to a certain value on a dataset. # Sets a property of the dataset to the value as passed in. # @param: # $1 dataset who's property is being set # $2 property to set # $3 value to set property to # @return: # 0 if the property could be set. # non-zero otherwise. # @use: ZFS # function dataset_setprop { typeset fn=dataset_setprop if (($# < 3)); then log_note "$fn: Insufficient parameters (need 3, had $#)" return 1 fi typeset output= output=$(zfs set $2=$3 $1 2>&1) typeset rv=$? if ((rv != 0)); then log_note "Setting property on $1 failed." log_note "property $2=$3" log_note "Return Code: $rv" log_note "Output: $output" return $rv fi return 0 } # # Check a numeric assertion # @parameter: $@ the assertion to check # @output: big loud notice if assertion failed # @use: log_fail # function assert { (($@)) || log_fail "$@" } # # Function to format partition size of a disk # Given a disk cxtxdx reduces all partitions # to 0 size # function zero_partitions # { typeset diskname=$1 typeset i if is_freebsd; then gpart destroy -F $diskname elif is_linux; then DSK=$DEV_DSKDIR/$diskname DSK=$(echo $DSK | sed -e "s|//|/|g") log_must parted $DSK -s -- mklabel gpt blockdev --rereadpt $DSK 2>/dev/null block_device_wait else for i in 0 1 3 4 5 6 7 do log_must set_partition $i "" 0mb $diskname done fi return 0 } # # Given a slice, size and disk, this function # formats the slice to the specified size. # Size should be specified with units as per # the `format` command requirements eg. 100mb 3gb # # NOTE: This entire interface is problematic for the Linux parted utility # which requires the end of the partition to be specified. It would be # best to retire this interface and replace it with something more flexible. # At the moment a best effort is made. # # arguments: function set_partition { typeset -i slicenum=$1 typeset start=$2 typeset size=$3 typeset disk=${4#$DEV_DSKDIR/} disk=${disk#$DEV_RDSKDIR/} case "$UNAME" in Linux) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk typeset size_mb=${size%%[mMgG]} size_mb=${size_mb%%[mMgG][bB]} if [[ ${size:1:1} == 'g' ]]; then ((size_mb = size_mb * 1024)) fi # Create GPT partition table when setting slice 0 or # when the device doesn't already contain a GPT label. parted $disk -s -- print 1 >/dev/null typeset ret_val=$? if [[ $slicenum -eq 0 || $ret_val -ne 0 ]]; then if ! parted $disk -s -- mklabel gpt; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi # When no start is given align on the first cylinder. if [[ -z "$start" ]]; then start=1 fi # Determine the cylinder size for the device and using # that calculate the end offset in cylinders. typeset -i cly_size_kb=0 cly_size_kb=$(parted -m $disk -s -- unit cyl print | awk -F '[:k.]' 'NR == 3 {print $4}') ((end = (size_mb * 1024 / cly_size_kb) + start)) parted $disk -s -- \ mkpart part$slicenum ${start}cyl ${end}cyl typeset ret_val=$? if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi blockdev --rereadpt $disk 2>/dev/null block_device_wait $disk ;; FreeBSD) if [[ -z $size || -z $disk ]]; then log_fail "The size or disk name is unspecified." fi disk=$DEV_DSKDIR/$disk if [[ $slicenum -eq 0 ]] || ! gpart show $disk >/dev/null 2>&1; then gpart destroy -F $disk >/dev/null 2>&1 if ! gpart create -s GPT $disk; then log_note "Failed to create GPT partition table on $disk" return 1 fi fi typeset index=$((slicenum + 1)) if [[ -n $start ]]; then start="-b $start" fi gpart add -t freebsd-zfs $start -s $size -i $index $disk if [[ $ret_val -ne 0 ]]; then log_note "Failed to create partition $slicenum on $disk" return 1 fi block_device_wait $disk ;; *) if [[ -z $slicenum || -z $size || -z $disk ]]; then log_fail "The slice, size or disk name is unspecified." fi typeset format_file=/var/tmp/format_in.$$ echo "partition" >$format_file echo "$slicenum" >> $format_file echo "" >> $format_file echo "" >> $format_file echo "$start" >> $format_file echo "$size" >> $format_file echo "label" >> $format_file echo "" >> $format_file echo "q" >> $format_file echo "q" >> $format_file format -e -s -d $disk -f $format_file typeset ret_val=$? rm -f $format_file ;; esac if [[ $ret_val -ne 0 ]]; then log_note "Unable to format $disk slice $slicenum to $size" return 1 fi return 0 } # # Delete all partitions on all disks - this is specifically for the use of multipath # devices which currently can only be used in the test suite as raw/un-partitioned # devices (ie a zpool cannot be created on a whole mpath device that has partitions) # function delete_partitions { typeset disk if [[ -z $DISKSARRAY ]]; then DISKSARRAY=$DISKS fi if is_linux; then typeset -i part for disk in $DISKSARRAY; do for (( part = 1; part < MAX_PARTITIONS; part++ )); do typeset partition=${disk}${SLICE_PREFIX}${part} parted $DEV_DSKDIR/$disk -s rm $part > /dev/null 2>&1 if lsblk | grep -qF ${partition}; then log_fail "Partition ${partition} not deleted" else log_note "Partition ${partition} deleted" fi done done elif is_freebsd; then for disk in $DISKSARRAY; do if gpart destroy -F $disk; then log_note "Partitions for ${disk} deleted" else log_fail "Partitions for ${disk} not deleted" fi done fi } # # Get the end cyl of the given slice # function get_endslice # { typeset disk=$1 typeset slice=$2 if [[ -z $disk || -z $slice ]] ; then log_fail "The disk name or slice number is unspecified." fi case "$UNAME" in Linux) endcyl=$(parted -s $DEV_DSKDIR/$disk -- unit cyl print | \ awk "/part${slice}/"' {sub(/cyl/, "", $3); print $3}') ((endcyl = (endcyl + 1))) ;; FreeBSD) disk=${disk#/dev/zvol/} disk=${disk%p*} slice=$((slice + 1)) endcyl=$(gpart show $disk | \ awk -v slice=$slice '$3 == slice { print $1 + $2 }') ;; *) disk=${disk#/dev/dsk/} disk=${disk#/dev/rdsk/} disk=${disk%s*} typeset -i ratio=0 ratio=$(prtvtoc /dev/rdsk/${disk}s2 | \ awk '/sectors\/cylinder/ {print $2}') if ((ratio == 0)); then return fi typeset -i endcyl=$(prtvtoc -h /dev/rdsk/${disk}s2 | awk -v token="$slice" '$1 == token {print $6}') ((endcyl = (endcyl + 1) / ratio)) ;; esac echo $endcyl } # # Given a size,disk and total slice number, this function formats the # disk slices from 0 to the total slice number with the same specified # size. # function partition_disk # { typeset -i i=0 typeset slice_size=$1 typeset disk_name=$2 typeset total_slices=$3 typeset cyl zero_partitions $disk_name while ((i < $total_slices)); do if ! is_linux; then if ((i == 2)); then ((i = i + 1)) continue fi fi log_must set_partition $i "$cyl" $slice_size $disk_name cyl=$(get_endslice $disk_name $i) ((i = i+1)) done } # # This function continues to write to a filenum number of files into dirnum # number of directories until either file_write returns an error or the # maximum number of files per directory have been written. # # Usage: # fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] # # Return value: 0 on success # non 0 on error # # Where : # destdir: is the directory where everything is to be created under # dirnum: the maximum number of subdirectories to use, -1 no limit # filenum: the maximum number of files per subdirectory # bytes: number of bytes to write # num_writes: number of types to write out bytes # data: the data that will be written # # E.g. # fill_fs /testdir 20 25 1024 256 0 # # Note: bytes * num_writes equals the size of the testfile # function fill_fs # destdir dirnum filenum bytes num_writes data { typeset destdir=${1:-$TESTDIR} typeset -i dirnum=${2:-50} typeset -i filenum=${3:-50} typeset -i bytes=${4:-8192} typeset -i num_writes=${5:-10240} typeset data=${6:-0} mkdir -p $destdir/{1..$dirnum} for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do file_write -o create -f $f -b $bytes -c $num_writes -d $data \ || return done } # Get the specified dataset property in parsable format or fail function get_prop # property dataset { typeset prop=$1 typeset dataset=$2 zfs get -Hpo value "$prop" "$dataset" || log_fail "zfs get $prop $dataset" } # Get the specified pool property in parsable format or fail function get_pool_prop # property pool { typeset prop=$1 typeset pool=$2 zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } # Return 0 if a pool exists; $? otherwise # # $1 - pool name function poolexists { typeset pool=$1 if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi zpool get name "$pool" > /dev/null 2>&1 } # Return 0 if all the specified datasets exist; $? otherwise # # $1-n dataset name function datasetexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi zfs get name "$@" > /dev/null 2>&1 } # return 0 if none of the specified datasets exists, otherwise return 1. # # $1-n dataset name function datasetnonexists { if (($# == 0)); then log_note "No dataset name given." return 1 fi while (($# > 0)); do zfs list -H -t filesystem,snapshot,volume $1 > /dev/null 2>&1 \ && return 1 shift done return 0 } # FreeBSD breaks exports(5) at whitespace and doesn't process escapes # Solaris just breaks # # cf. https://github.com/openzfs/zfs/pull/13165#issuecomment-1059845807 # # Linux can have spaces (which are \OOO-escaped), # but can't have backslashes because they're parsed recursively function shares_can_have_whitespace { is_linux } function is_shared_freebsd { typeset fs=$1 pgrep -q mountd && showmount -E | grep -qx "$fs" } function is_shared_illumos { typeset fs=$1 typeset mtpt for mtpt in `share | awk '{print $2}'` ; do if [[ $mtpt == $fs ]] ; then return 0 fi done typeset stat=$(svcs -H -o STA nfs/server:default) if [[ $stat != "ON" ]]; then log_note "Current nfs/server status: $stat" fi return 1 } function is_shared_linux { typeset fs=$1 ! exportfs -s | awk -v fs="${fs//\\/\\\\}" '/^\// && $1 == fs {exit 1}' } # # Given a mountpoint, or a dataset name, determine if it is shared via NFS. # # Returns 0 if shared, 1 otherwise. # function is_shared { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case "$mtpt" in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_shared_freebsd "$fs" ;; Linux) is_shared_linux "$fs" ;; *) is_shared_illumos "$fs" ;; esac } function is_exported_illumos { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/dfs/sharetab return 1 } function is_exported_freebsd { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$mtpt" = "$fs" ] && return done < /etc/zfs/exports return 1 } function is_exported_linux { typeset fs=$1 typeset mtpt _ while read -r mtpt _; do [ "$(printf "$mtpt")" = "$fs" ] && return done < /etc/exports.d/zfs.exports return 1 } # # Given a mountpoint, or a dataset name, determine if it is exported via # the os-specific NFS exports file. # # Returns 0 if exported, 1 otherwise. # function is_exported { typeset fs=$1 typeset mtpt if [[ $fs != "/"* ]] ; then if datasetnonexists "$fs" ; then return 1 else mtpt=$(get_prop mountpoint "$fs") case $mtpt in none|legacy|-) return 1 ;; *) fs=$mtpt ;; esac fi fi case "$UNAME" in FreeBSD) is_exported_freebsd "$fs" ;; Linux) is_exported_linux "$fs" ;; *) is_exported_illumos "$fs" ;; esac } # # Given a dataset name determine if it is shared via SMB. # # Returns 0 if shared, 1 otherwise. # function is_shared_smb { typeset fs=$1 datasetexists "$fs" || return if is_linux; then net usershare list | grep -xFq "${fs//[-\/]/_}" else log_note "SMB on $UNAME currently unsupported by the test framework" return 1 fi } # # Given a mountpoint, determine if it is not shared via NFS. # # Returns 0 if not shared, 1 otherwise. # function not_shared { ! is_shared $1 } # # Given a dataset determine if it is not shared via SMB. # # Returns 0 if not shared, 1 otherwise. # function not_shared_smb { ! is_shared_smb $1 } # # Helper function to unshare a mountpoint. # function unshare_fs #fs { typeset fs=$1 if is_shared $fs || is_shared_smb $fs; then log_must zfs unshare $fs fi } # # Helper function to share a NFS mountpoint. # function share_nfs #fs { typeset fs=$1 is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid log_must eval "printf '%s\t\n' \"$fs\" >> /etc/zfs/exports" log_must kill -s HUP "$mountd" ;; *) log_must share -F nfs "$fs" ;; esac return 0 } # # Helper function to unshare a NFS mountpoint. # function unshare_nfs #fs { typeset fs=$1 ! is_shared "$fs" && return case "$UNAME" in Linux) log_must exportfs -u "*:$fs" ;; FreeBSD) typeset mountd read -r mountd < /var/run/mountd.pid awk -v fs="${fs//\\/\\\\}" '$1 != fs' /etc/zfs/exports > /etc/zfs/exports.$$ log_must mv /etc/zfs/exports.$$ /etc/zfs/exports log_must kill -s HUP "$mountd" ;; *) log_must unshare -F nfs $fs ;; esac return 0 } # # Helper function to show NFS shares. # function showshares_nfs { case "$UNAME" in Linux) exportfs -v ;; FreeBSD) showmount ;; *) share -F nfs ;; esac } function check_nfs { case "$UNAME" in Linux) exportfs -s ;; FreeBSD) showmount -e ;; *) log_unsupported "Unknown platform" ;; esac || log_unsupported "The NFS utilities are not installed" } # # Check NFS server status and trigger it online. # function setup_nfs_server { # Cannot share directory in non-global zone. # if ! is_global_zone; then log_note "Cannot trigger NFS server by sharing in LZ." return fi if is_linux; then # # Re-synchronize /var/lib/nfs/etab with /etc/exports and # /etc/exports.d./* to provide a clean test environment. # log_must exportfs -r log_note "NFS server must be started prior to running ZTS." return elif is_freebsd; then log_must kill -s HUP $(/dev/null) [ $cur_zone = "global" ] fi } # # Verify whether test is permitted to run from # global zone, local zone, or both # # $1 zone limit, could be "global", "local", or "both"(no limit) # # Return 0 if permitted, otherwise exit with log_unsupported # function verify_runnable # zone limit { typeset limit=$1 [[ -z $limit ]] && return 0 if is_global_zone ; then case $limit in global|both) ;; local) log_unsupported "Test is unable to run from "\ "global zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac else case $limit in local|both) ;; global) log_unsupported "Test is unable to run from "\ "local zone." ;; *) log_note "Warning: unknown limit $limit - " \ "use both." ;; esac reexport_pool fi return 0 } # Return 0 if create successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # $2-n - [keyword] devs_list function create_pool #pool devs_list { typeset pool=${1%%/*} shift if [[ -z $pool ]]; then log_note "Missing pool name." return 1 fi if poolexists $pool ; then destroy_pool $pool fi if is_global_zone ; then [[ -d /$pool ]] && rm -rf /$pool log_must zpool create -f $pool $@ fi return 0 } # Return 0 if destroy successfully or the pool exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - pool name # Destroy pool with the given parameters. function destroy_pool #pool { typeset pool=${1%%/*} typeset mtpt if [[ -z $pool ]]; then log_note "No pool name given." return 1 fi if is_global_zone ; then if poolexists "$pool" ; then mtpt=$(get_prop mountpoint "$pool") # At times, syseventd/udev activity can cause attempts # to destroy a pool to fail with EBUSY. We retry a few # times allowing failures before requiring the destroy # to succeed. log_must_busy zpool destroy -f $pool [[ -d $mtpt ]] && \ log_must rm -rf $mtpt else log_note "Pool does not exist. ($pool)" return 1 fi fi return 0 } # Return 0 if created successfully; $? otherwise # # $1 - dataset name # $2-n - dataset options function create_dataset #dataset dataset_options { typeset dataset=$1 shift if [[ -z $dataset ]]; then log_note "Missing dataset name." return 1 fi if datasetexists $dataset ; then destroy_dataset $dataset fi log_must zfs create $@ $dataset return 0 } # Return 0 if destroy successfully or the dataset exists; $? otherwise # Note: In local zones, this function should return 0 silently. # # $1 - dataset name # $2 - custom arguments for zfs destroy # Destroy dataset with the given parameters. function destroy_dataset # dataset [args] { typeset dataset=$1 typeset mtpt typeset args=${2:-""} if [[ -z $dataset ]]; then log_note "No dataset name given." return 1 fi if is_global_zone ; then if datasetexists "$dataset" ; then mtpt=$(get_prop mountpoint "$dataset") log_must_busy zfs destroy $args $dataset [ -d $mtpt ] && log_must rm -rf $mtpt else log_note "Dataset does not exist. ($dataset)" return 1 fi fi return 0 } # # Reexport TESTPOOL & TESTPOOL(1-4) # function reexport_pool { typeset -i cntctr=5 typeset -i i=0 while ((i < cntctr)); do if ((i == 0)); then TESTPOOL=$ZONE_POOL/$ZONE_CTR$i if ! ismounted $TESTPOOL; then log_must zfs mount $TESTPOOL fi else eval TESTPOOL$i=$ZONE_POOL/$ZONE_CTR$i if eval ! ismounted \$TESTPOOL$i; then log_must eval zfs mount \$TESTPOOL$i fi fi ((i += 1)) done } # # Verify a given disk or pool state # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_state # pool disk state{online,offline,degraded} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 [[ -z $pool ]] || [[ -z $state ]] \ && log_fail "Arguments invalid or missing" if [[ -z $disk ]]; then #check pool state only zpool get -H -o value health $pool | grep -qi "$state" else zpool status -v $pool | grep "$disk" | grep -qi "$state" fi } # # Get the mountpoint of snapshot # For the snapshot use /.zfs/snapshot/ # as its mountpoint # function snapshot_mountpoint { typeset dataset=${1:-$TESTPOOL/$TESTFS@$TESTSNAP} if [[ $dataset != *@* ]]; then log_fail "Error name of snapshot '$dataset'." fi typeset fs=${dataset%@*} typeset snap=${dataset#*@} if [[ -z $fs || -z $snap ]]; then log_fail "Error name of snapshot '$dataset'." fi echo $(get_prop mountpoint $fs)/.zfs/snapshot/$snap } # # Given a device and 'ashift' value verify it's correctly set on every label # function verify_ashift # device ashift { typeset device="$1" typeset ashift="$2" zdb -e -lll $device | awk -v ashift=$ashift ' /ashift: / { if (ashift != $2) exit 1; else count++; } END { exit (count != 4); }' } # # Given a pool and file system, this function will verify the file system # using the zdb internal tool. Note that the pool is exported and imported # to ensure it has consistent state. # function verify_filesys # pool filesystem dir { typeset pool="$1" typeset filesys="$2" typeset zdbout="/tmp/zdbout.$$" shift shift typeset dirs=$@ typeset search_path="" log_note "Calling zdb to verify filesystem '$filesys'" zfs unmount -a > /dev/null 2>&1 log_must zpool export $pool if [[ -n $dirs ]] ; then for dir in $dirs ; do search_path="$search_path -d $dir" done fi log_must zpool import $search_path $pool if ! zdb -cudi $filesys > $zdbout 2>&1; then log_note "Output: zdb -cudi $filesys" cat $zdbout rm -f $zdbout log_fail "zdb detected errors with: '$filesys'" fi log_must zfs mount -a log_must rm -rf $zdbout } # # Given a pool issue a scrub and verify that no checksum errors are reported. # function verify_pool { typeset pool=${1:-$TESTPOOL} log_must zpool scrub $pool log_must wait_scrubbed $pool typeset -i cksum=$(zpool status $pool | awk ' !NF { isvdev = 0 } isvdev { errors += $NF } /CKSUM$/ { isvdev = 1 } END { print errors } ') if [[ $cksum != 0 ]]; then log_must zpool status -v log_fail "Unexpected CKSUM errors found on $pool ($cksum)" fi } # # Given a pool, and this function list all disks in the pool # function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") } # # Given a pool, and this function list all disks in the pool with their full # path (like "/dev/sda" instead of "sda"). # function get_disklist_fullpath # pool { get_disklist "-P $1" } # /** # This function kills a given list of processes after a time period. We use # this in the stress tests instead of STF_TIMEOUT so that we can have processes # run for a fixed amount of time, yet still pass. Tests that hit STF_TIMEOUT # would be listed as FAIL, which we don't want : we're happy with stress tests # running for a certain amount of time, then finishing. # # @param $1 the time in seconds after which we should terminate these processes # @param $2..$n the processes we wish to terminate. # */ function stress_timeout { typeset -i TIMEOUT=$1 shift typeset cpids="$@" log_note "Waiting for child processes($cpids). " \ "It could last dozens of minutes, please be patient ..." log_must sleep $TIMEOUT log_note "Killing child processes after ${TIMEOUT} stress timeout." typeset pid for pid in $cpids; do ps -p $pid > /dev/null 2>&1 && log_must kill -USR1 $pid done } # # Verify a given hotspare disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_hotspare_state # pool disk state{inuse,avail} { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk "spares") [ $state = $cur_state ] } # # Wait until a hotspare transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_hotspare_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_hotspare_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Verify a given vdev disk is inuse or avail # # Return 0 is pool/disk matches expected state, 1 otherwise # function check_vdev_state # pool disk state{online,offline,unavail,removed} { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 cur_state=$(get_device_state $pool $disk) [ $state = $cur_state ] } # # Wait until a vdev transitions to a given state or times out. # # Return 0 when pool/disk matches expected state, 1 on timeout. # function wait_vdev_state # pool disk state timeout { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} typeset state=$3 typeset timeout=${4:-60} typeset -i i=0 while [[ $i -lt $timeout ]]; do if check_vdev_state $pool $disk $state; then return 0 fi i=$((i+1)) sleep 1 done return 1 } # # Check the output of 'zpool status -v ', # and to see if the content of contain the specified. # # Return 0 is contain, 1 otherwise # function check_pool_status # pool token keyword { typeset pool=$1 typeset token=$2 typeset keyword=$3 typeset verbose=${4:-false} scan=$(zpool status -v "$pool" 2>/dev/null | awk -v token="$token:" '$1==token') if [[ $verbose == true ]]; then log_note $scan fi echo $scan | grep -qi "$keyword" } # # The following functions are instance of check_pool_status() # is_pool_resilvering - to check if the pool resilver is in progress # is_pool_resilvered - to check if the pool resilver is completed # is_pool_scrubbing - to check if the pool scrub is in progress # is_pool_scrubbed - to check if the pool scrub is completed # is_pool_scrub_stopped - to check if the pool scrub is stopped # is_pool_scrub_paused - to check if the pool scrub has paused # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded # is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 } function is_pool_resilvered #pool { check_pool_status "$1" "scan" "resilvered " $2 } function is_pool_scrubbing #pool { check_pool_status "$1" "scan" "scrub in progress since " $2 } function is_pool_error_scrubbing #pool { check_pool_status "$1" "scrub" "error scrub in progress since " $2 return $? } function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 } function is_pool_scrub_stopped #pool { check_pool_status "$1" "scan" "scrub canceled" $2 } function is_pool_error_scrub_stopped #pool { check_pool_status "$1" "scrub" "error scrub canceled on " $2 return $? } function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 } function is_pool_error_scrub_paused #pool { check_pool_status "$1" "scrub" "error scrub paused since " $2 return $? } function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " } function is_pool_removed #pool { check_pool_status "$1" "remove" "completed on" } function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } function is_pool_replacing #pool { zpool status "$1" | grep -qE 'replacing-[0-9]+' } function wait_for_degraded { typeset pool=$1 typeset timeout=${2:-30} typeset t0=$SECONDS while :; do [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break log_note "$pool is not yet degraded." sleep 1 if ((SECONDS - t0 > $timeout)); then log_note "$pool not degraded after $timeout seconds." return 1 fi done return 0 } # # Use create_pool()/destroy_pool() to clean up the information in # in the given disk to avoid slice overlapping. # function cleanup_devices #vdevs { typeset pool="foopool$$" for vdev in $@; do zero_partitions $vdev done poolexists $pool && destroy_pool $pool create_pool $pool $@ destroy_pool $pool return 0 } #/** # A function to find and locate free disks on a system or from given # disks as the parameter. It works by locating disks that are in use # as swap devices and dump devices, and also disks listed in /etc/vfstab # # $@ given disks to find which are free, default is all disks in # the test system # # @return a string containing the list of available disks #*/ function find_disks { # Trust provided list, no attempt is made to locate unused devices. if is_linux || is_freebsd; then echo "$@" return fi sfi=/tmp/swaplist.$$ dmpi=/tmp/dumpdev.$$ max_finddisksnum=${MAX_FINDDISKSNUM:-6} swap -l > $sfi dumpadm > $dmpi 2>/dev/null disks=${@:-$(echo "" | format -e 2>/dev/null | awk ' BEGIN { FS="."; } /^Specify disk/{ searchdisks=0; } { if (searchdisks && $2 !~ "^$"){ split($2,arr," "); print arr[1]; } } /^AVAILABLE DISK SELECTIONS:/{ searchdisks=1; } ')} unused="" for disk in $disks; do # Check for mounted grep -q "${disk}[sp]" /etc/mnttab && continue # Check for swap grep -q "${disk}[sp]" $sfi && continue # check for dump device grep -q "${disk}[sp]" $dmpi && continue # check to see if this disk hasn't been explicitly excluded # by a user-set environment variable echo "${ZFS_HOST_DEVICES_IGNORE}" | grep -q "${disk}" && continue unused_candidates="$unused_candidates $disk" done rm $sfi $dmpi # now just check to see if those disks do actually exist # by looking for a device pointing to the first slice in # each case. limit the number to max_finddisksnum count=0 for disk in $unused_candidates; do if is_disk_device $DEV_DSKDIR/${disk}s0 && \ [ $count -lt $max_finddisksnum ]; then unused="$unused $disk" # do not impose limit if $@ is provided [[ -z $@ ]] && ((count = count + 1)) fi done # finally, return our disk list echo $unused } function add_user_freebsd # { typeset group=$1 typeset user=$2 typeset basedir=$3 # Check to see if the user exists. if id $user > /dev/null 2>&1; then return 0 fi # Assign 1000 as the base uid typeset -i uid=1000 while true; do pw useradd -u $uid -g $group -d $basedir/$user -m -n $user case $? in 0) break ;; # The uid is not unique 65) ((uid += 1)) ;; *) return 1 ;; esac if [[ $uid == 65000 ]]; then log_fail "No user id available under 65000 for $user" fi done # Silence MOTD touch $basedir/$user/.hushlogin return 0 } # # Delete the specified user. # # $1 login name # function del_user_freebsd # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must pw userdel $user fi return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group_freebsd # { typeset group=$1 # See if the group already exists. if pw groupshow $group >/dev/null 2>&1; then return 0 fi # Assign 1000 as the base gid typeset -i gid=1000 while true; do pw groupadd -g $gid -n $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 65) ((gid += 1)) ;; *) return 1 ;; esac if [[ $gid == 65000 ]]; then log_fail "No user id available under 65000 for $group" fi done } # # Delete the specified group. # # $1 group name # function del_group_freebsd # { typeset group=$1 pw groupdel -n $group > /dev/null 2>&1 case $? in # Group does not exist, or was deleted successfully. 0|6|65) return 0 ;; # Name already exists as a group name 9) log_must pw groupdel $group ;; *) return 1 ;; esac return 0 } function add_user_illumos # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user return 0 } function del_user_illumos # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi return 0 } function add_group_illumos # { typeset group=$1 typeset -i gid=100 while true; do groupadd -g $gid $group > /dev/null 2>&1 case $? in 0) return 0 ;; # The gid is not unique 4) ((gid += 1)) ;; *) return 1 ;; esac done } function del_group_illumos # { typeset group=$1 groupmod -n $grp $grp > /dev/null 2>&1 case $? in # Group does not exist. 6) return 0 ;; # Name already exists as a group name 9) log_must groupdel $grp ;; *) return 1 ;; esac } function add_user_linux # { typeset group=$1 typeset user=$2 typeset basedir=$3 log_must useradd -g $group -d $basedir/$user -m $user # Add new users to the same group and the command line utils. # This allows them to be run out of the original users home # directory as long as it permissioned to be group readable. cmd_group=$(stat --format="%G" $(command -v zfs)) log_must usermod -a -G $cmd_group $user return 0 } function del_user_linux # { typeset user=$1 if id $user > /dev/null 2>&1; then log_must_retry "currently used" 6 userdel $user fi } function add_group_linux # { typeset group=$1 # Assign 100 as the base gid, a larger value is selected for # Linux because for many distributions 1000 and under are reserved. while true; do groupadd $group > /dev/null 2>&1 case $? in 0) return 0 ;; *) return 1 ;; esac done } function del_group_linux # { typeset group=$1 getent group $group > /dev/null 2>&1 case $? in # Group does not exist. 2) return 0 ;; # Name already exists as a group name 0) log_must groupdel $group ;; *) return 1 ;; esac return 0 } # # Add specified user to specified group # # $1 group name # $2 user name # $3 base of the homedir (optional) # function add_user # { typeset group=$1 typeset user=$2 typeset basedir=${3:-"/var/tmp"} if ((${#group} == 0 || ${#user} == 0)); then log_fail "group name or user name are not defined." fi case "$UNAME" in FreeBSD) add_user_freebsd "$group" "$user" "$basedir" ;; Linux) add_user_linux "$group" "$user" "$basedir" ;; *) add_user_illumos "$group" "$user" "$basedir" ;; esac return 0 } # # Delete the specified user. # # $1 login name # $2 base of the homedir (optional) # function del_user # { typeset user=$1 typeset basedir=${2:-"/var/tmp"} if ((${#user} == 0)); then log_fail "login name is necessary." fi case "$UNAME" in FreeBSD) del_user_freebsd "$user" ;; Linux) del_user_linux "$user" ;; *) del_user_illumos "$user" ;; esac [[ -d $basedir/$user ]] && rm -fr $basedir/$user return 0 } # # Select valid gid and create specified group. # # $1 group name # function add_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) add_group_freebsd "$group" ;; Linux) add_group_linux "$group" ;; *) add_group_illumos "$group" ;; esac return 0 } # # Delete the specified group. # # $1 group name # function del_group # { typeset group=$1 if ((${#group} == 0)); then log_fail "group name is necessary." fi case "$UNAME" in FreeBSD) del_group_freebsd "$group" ;; Linux) del_group_linux "$group" ;; *) del_group_illumos "$group" ;; esac return 0 } # # This function will return true if it's safe to destroy the pool passed # as argument 1. It checks for pools based on zvols and files, and also # files contained in a pool that may have a different mountpoint. # function safe_to_destroy_pool { # $1 the pool name typeset pool="" typeset DONT_DESTROY="" # We check that by deleting the $1 pool, we're not # going to pull the rug out from other pools. Do this # by looking at all other pools, ensuring that they # aren't built from files or zvols contained in this pool. for pool in $(zpool list -H -o name) do ALTMOUNTPOOL="" # this is a list of the top-level directories in each of the # files that make up the path to the files the pool is based on FILEPOOL=$(zpool status -v $pool | awk -v pool="/$1/" '$0 ~ pool {print $1}') # this is a list of the zvols that make up the pool ZVOLPOOL=$(zpool status -v $pool | awk -v zvols="$ZVOL_DEVDIR/$1$" '$0 ~ zvols {print $1}') # also want to determine if it's a file-based pool using an # alternate mountpoint... POOL_FILE_DIRS=$(zpool status -v $pool | \ awk '/\// {print $1}' | \ awk -F/ '!/dev/ {print $2}') for pooldir in $POOL_FILE_DIRS do OUTPUT=$(zfs list -H -r -o mountpoint $1 | \ awk -v pd="${pooldir}$" '$0 ~ pd {print $1}') ALTMOUNTPOOL="${ALTMOUNTPOOL}${OUTPUT}" done if [ ! -z "$ZVOLPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ZVOLPOOL on $1" fi if [ ! -z "$FILEPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $FILEPOOL on $1" fi if [ ! -z "$ALTMOUNTPOOL" ] then DONT_DESTROY="true" log_note "Pool $pool is built from $ALTMOUNTPOOL on $1" fi done if [ -z "${DONT_DESTROY}" ] then return 0 else log_note "Warning: it is not safe to destroy $1!" return 1 fi } # # Verify zfs operation with -p option work as expected # $1 operation, value could be create, clone or rename # $2 dataset type, value could be fs or vol # $3 dataset name # $4 new dataset name # function verify_opt_p_ops { typeset ops=$1 typeset datatype=$2 typeset dataset=$3 typeset newdataset=$4 if [[ $datatype != "fs" && $datatype != "vol" ]]; then log_fail "$datatype is not supported." fi # check parameters accordingly case $ops in create) newdataset=$dataset dataset="" if [[ $datatype == "vol" ]]; then ops="create -V $VOLSIZE" fi ;; clone) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset log_must snapexists $dataset ;; rename) if [[ -z $newdataset ]]; then log_fail "newdataset should not be empty" \ "when ops is $ops." fi log_must datasetexists $dataset ;; *) log_fail "$ops is not supported." ;; esac # make sure the upper level filesystem does not exist destroy_dataset "${newdataset%/*}" "-rRf" # without -p option, operation will fail log_mustnot zfs $ops $dataset $newdataset log_mustnot datasetexists $newdataset ${newdataset%/*} # with -p option, operation should succeed log_must zfs $ops -p $dataset $newdataset block_device_wait if ! datasetexists $newdataset ; then log_fail "-p option does not work for $ops" fi # when $ops is create or clone, redo the operation still return zero if [[ $ops != "rename" ]]; then log_must zfs $ops -p $dataset $newdataset fi return 0 } # # Get configuration of pool # $1 pool name # $2 config name # function get_config { typeset pool=$1 typeset config=$2 if ! poolexists "$pool" ; then return 1 fi if [ "$(get_pool_prop cachefile "$pool")" = "none" ]; then zdb -e $pool else zdb -C $pool fi | awk -F: -v cfg="$config:" '$0 ~ cfg {sub(/^'\''/, $2); sub(/'\''$/, $2); print $2}' } # # Privated function. Random select one of items from arguments. # # $1 count # $2-n string # function _random_get { typeset cnt=$1 shift typeset str="$@" typeset -i ind ((ind = RANDOM % cnt + 1)) echo "$str" | cut -f $ind -d ' ' } # # Random select one of item from arguments which include NONE string # function random_get_with_non { typeset -i cnt=$# ((cnt =+ 1)) _random_get "$cnt" "$@" } # # Random select one of item from arguments which doesn't include NONE string # function random_get { _random_get "$#" "$@" } # # The function will generate a dataset name with specific length # $1, the length of the name # $2, the base string to construct the name # function gen_dataset_name { typeset -i len=$1 typeset basestr="$2" typeset -i baselen=${#basestr} typeset -i iter=0 typeset l_name="" if ((len % baselen == 0)); then ((iter = len / baselen)) else ((iter = len / baselen + 1)) fi while ((iter > 0)); do l_name="${l_name}$basestr" ((iter -= 1)) done echo $l_name } # # Get cksum tuple of dataset # $1 dataset name # # sample zdb output: # Dataset data/test [ZPL], ID 355, cr_txg 2413856, 31.0K, 7 objects, rootbp # DVA[0]=<0:803046400:200> DVA[1]=<0:81199000:200> [L0 DMU objset] fletcher4 # lzjb LE contiguous unique double size=800L/200P birth=2413856L/2413856P # fill=7 cksum=11ce125712:643a9c18ee2:125e25238fca0:254a3f74b59744 function datasetcksum { typeset cksum sync sync_all_pools zdb -vvv $1 | awk -F= -v ds="^Dataset $1 "'\\[' '$0 ~ ds && /cksum/ {print $7}' } # # Get the given disk/slice state from the specific field of the pool # function get_device_state #pool disk field("", "spares","logs") { typeset pool=$1 typeset disk=${2#$DEV_DSKDIR/} typeset field=${3:-$pool} zpool status -v "$pool" 2>/dev/null | \ awk -v device=$disk -v pool=$pool -v field=$field \ 'BEGIN {startconfig=0; startfield=0; } /config:/ {startconfig=1} (startconfig==1) && ($1==field) {startfield=1; next;} (startfield==1) && ($1==device) {print $2; exit;} (startfield==1) && ($1==field || $1 ~ "^spares$" || $1 ~ "^logs$") {startfield=0}' } # # get the root filesystem name if it's zfsroot system. # # return: root filesystem name function get_rootfs { typeset rootfs="" if is_freebsd; then rootfs=$(mount -p | awk '$2 == "/" && $3 == "zfs" {print $1}') elif ! is_linux; then rootfs=$(awk '$2 == "/" && $3 == "zfs" {print $1}' \ /etc/mnttab) fi if [[ -z "$rootfs" ]]; then log_fail "Can not get rootfs" fi if datasetexists $rootfs; then echo $rootfs else log_fail "This is not a zfsroot system." fi } # # get the rootfs's pool name # return: # rootpool name # function get_rootpool { typeset rootfs=$(get_rootfs) echo ${rootfs%%/*} } # # To verify if the require numbers of disks is given # function verify_disk_count { typeset -i min=${2:-1} typeset -i count=$(echo "$1" | wc -w) if ((count < min)); then log_untested "A minimum of $min disks is required to run." \ " You specified $count disk(s)" fi } function ds_is_volume { typeset type=$(get_prop type $1) [ $type = "volume" ] } function ds_is_filesystem { typeset type=$(get_prop type $1) [ $type = "filesystem" ] } # # Check if Trusted Extensions are installed and enabled # function is_te_enabled { svcs -H -o state labeld 2>/dev/null | grep -q "enabled" } # Return the number of CPUs (cross-platform) function get_num_cpus { if is_linux ; then grep -c '^processor' /proc/cpuinfo elif is_freebsd; then sysctl -n kern.smp.cpus else psrinfo | wc -l fi } # Utility function to determine if a system has multiple cpus. function is_mp { [[ $(get_num_cpus) -gt 1 ]] } function get_cpu_freq { if is_linux; then lscpu | awk '/CPU MHz/ { print $3 }' elif is_freebsd; then sysctl -n hw.clockrate else psrinfo -v 0 | awk '/processor operates at/ {print $6}' fi } # Run the given command as the user provided. function user_run { typeset user=$1 shift log_note "user: $user" log_note "cmd: $*" typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err typeset res=$? log_note "out: $(<$out)" log_note "err: $(<$err)" return $res } # # Check if the pool contains the specified vdevs # # $1 pool # $2..n ... # # Return 0 if the vdevs are contained in the pool, 1 if any of the specified # vdevs is not in the pool, and 2 if pool name is missing. # function vdevs_in_pool { typeset pool=$1 typeset vdev if [[ -z $pool ]]; then log_note "Missing pool name." return 2 fi shift # We could use 'zpool list' to only get the vdevs of the pool but we # can't reference a mirror/raidz vdev using its ID (i.e mirror-0), # therefore we use the 'zpool status' output. typeset tmpfile=$(mktemp) zpool status -v "$pool" | grep -A 1000 "config:" >$tmpfile for vdev in "$@"; do grep -wq ${vdev##*/} $tmpfile || return 1 done rm -f $tmpfile return 0 } function get_max { typeset -l i max=$1 shift for i in "$@"; do max=$((max > i ? max : i)) done echo $max } # Write data that can be compressed into a directory function write_compressible { typeset dir=$1 typeset megs=$2 typeset nfiles=${3:-1} typeset bs=${4:-1024k} typeset fname=${5:-file} [[ -d $dir ]] || log_fail "No directory: $dir" # Under Linux fio is not currently used since its behavior can # differ significantly across versions. This includes missing # command line options and cases where the --buffer_compress_* # options fail to behave as expected. if is_linux; then typeset file_bytes=$(to_bytes $megs) typeset bs_bytes=4096 typeset blocks=$(($file_bytes / $bs_bytes)) for (( i = 0; i < $nfiles; i++ )); do truncate -s $file_bytes $dir/$fname.$i # Write every third block to get 66% compression. for (( j = 0; j < $blocks; j += 3 )); do dd if=/dev/urandom of=$dir/$fname.$i \ seek=$j bs=$bs_bytes count=1 \ conv=notrunc >/dev/null 2>&1 done done else command -v fio > /dev/null || log_unsupported "fio missing" log_must eval fio \ --name=job \ --fallocate=0 \ --minimal \ --randrepeat=0 \ --buffer_compress_percentage=66 \ --buffer_compress_chunk=4096 \ --directory="$dir" \ --numjobs="$nfiles" \ --nrfiles="$nfiles" \ --rw=write \ --bs="$bs" \ --filesize="$megs" \ "--filename_format='$fname.\$jobnum' >/dev/null" fi } function get_objnum { typeset pathname=$1 typeset objnum [[ -e $pathname ]] || log_fail "No such file or directory: $pathname" if is_freebsd; then objnum=$(stat -f "%i" $pathname) else objnum=$(stat -c %i $pathname) fi echo $objnum } # # Sync data to the pool # # $1 pool name # $2 boolean to force uberblock (and config including zpool cache file) update # function sync_pool #pool { typeset pool=${1:-$TESTPOOL} typeset force=${2:-false} if [[ $force == true ]]; then log_must zpool sync -f $pool else log_must zpool sync $pool fi return 0 } # # Sync all pools # # $1 boolean to force uberblock (and config including zpool cache file) update # function sync_all_pools # { typeset force=${1:-false} if [[ $force == true ]]; then log_must zpool sync -f else log_must zpool sync fi return 0 } # # Wait for zpool 'freeing' property drops to zero. # # $1 pool name # function wait_freeing #pool { typeset pool=${1:-$TESTPOOL} while true; do [[ "0" == "$(zpool list -Ho freeing $pool)" ]] && break log_must sleep 1 done } # # Wait for every device replace operation to complete # # $1 pool name # $2 timeout # function wait_replacing #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_replacing $pool || break; sleep 1; done } # Wait for a pool to be scrubbed # # $1 pool name # $2 timeout # function wait_scrubbed #pool timeout { typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} for (( timer = 0; timer < $timeout; timer++ )); do is_pool_scrubbed $pool && break; sleep 1; done } # Backup the zed.rc in our test directory so that we can edit it for our test. # # Returns: Backup file name. You will need to pass this to zed_rc_restore(). function zed_rc_backup { zedrc_backup="$(mktemp)" cp $ZEDLET_DIR/zed.rc $zedrc_backup echo $zedrc_backup } function zed_rc_restore { mv $1 $ZEDLET_DIR/zed.rc } # # Setup custom environment for the ZED. # # $@ Optional list of zedlets to run under zed. function zed_setup { if ! is_linux; then log_unsupported "No zed on $UNAME" fi if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi if [[ ! -e $VDEVID_CONF ]]; then log_must touch $VDEVID_CONF fi if [[ -e $VDEVID_CONF_ETC ]]; then log_fail "Must not have $VDEVID_CONF_ETC file present on system" fi EXTRA_ZEDLETS=$@ # Create a symlink for /etc/zfs/vdev_id.conf file. log_must ln -s $VDEVID_CONF $VDEVID_CONF_ETC # Setup minimal ZED configuration. Individual test cases should # add additional ZEDLETs as needed for their specific test. log_must cp ${ZEDLET_ETC_DIR}/zed.rc $ZEDLET_DIR log_must cp ${ZEDLET_ETC_DIR}/zed-functions.sh $ZEDLET_DIR # Scripts must only be user writable. if [[ -n "$EXTRA_ZEDLETS" ]] ; then saved_umask=$(umask) log_must umask 0022 for i in $EXTRA_ZEDLETS ; do log_must cp ${ZEDLET_LIBEXEC_DIR}/$i $ZEDLET_DIR done log_must umask $saved_umask fi # Customize the zed.rc file to enable the full debug log. log_must sed -i '/\#ZED_DEBUG_LOG=.*/d' $ZEDLET_DIR/zed.rc echo "ZED_DEBUG_LOG=$ZED_DEBUG_LOG" >>$ZEDLET_DIR/zed.rc } # # Cleanup custom ZED environment. # # $@ Optional list of zedlets to remove from our test zed.d directory. function zed_cleanup { if ! is_linux; then return fi for extra_zedlet; do log_must rm -f ${ZEDLET_DIR}/$extra_zedlet done log_must rm -fd ${ZEDLET_DIR}/zed.rc ${ZEDLET_DIR}/zed-functions.sh ${ZEDLET_DIR}/all-syslog.sh ${ZEDLET_DIR}/all-debug.sh ${ZEDLET_DIR}/state \ $ZED_LOG $ZED_DEBUG_LOG $VDEVID_CONF_ETC $VDEVID_CONF \ $ZEDLET_DIR } # # Check if ZED is currently running; if so, returns PIDs # function zed_check { if ! is_linux; then return fi zedpids="$(pgrep -x zed)" zedpids2="$(pgrep -x lt-zed)" echo ${zedpids} ${zedpids2} } # # Check if ZED is currently running, if not start ZED. # function zed_start { if ! is_linux; then return fi # ZEDLET_DIR=/var/tmp/zed if [[ ! -d $ZEDLET_DIR ]]; then log_must mkdir $ZEDLET_DIR fi # Verify the ZED is not already running. zedpids=$(zed_check) if [ -n "$zedpids" ]; then # We never, ever, really want it to just keep going if zed # is already running - usually this implies our test cases # will break very strangely because whatever we wanted to # configure zed for won't be listening to our changes in the # tmpdir log_fail "ZED already running - ${zedpids}" else log_note "Starting ZED" # run ZED in the background and redirect foreground logging # output to $ZED_LOG. log_must truncate -s 0 $ZED_DEBUG_LOG log_must eval "zed -vF -d $ZEDLET_DIR -P $PATH" \ "-s $ZEDLET_DIR/state -j 1 2>$ZED_LOG &" fi return 0 } # # Kill ZED process # function zed_stop { if ! is_linux; then return "" fi log_note "Stopping ZED" while true; do zedpids=$(zed_check) [ ! -n "$zedpids" ] && break log_must kill $zedpids sleep 1 done return 0 } # # Drain all zevents # function zed_events_drain { while [ $(zpool events -H | wc -l) -ne 0 ]; do sleep 1 zpool events -c >/dev/null done } # Set a variable in zed.rc to something, un-commenting it in the process. # # $1 variable # $2 value function zed_rc_set { var="$1" val="$2" # Remove the line cmd="'/$var/d'" eval sed -i $cmd $ZEDLET_DIR/zed.rc # Add it at the end echo "$var=$val" >> $ZEDLET_DIR/zed.rc } # # Check is provided device is being active used as a swap device. # function is_swap_inuse { typeset device=$1 if [[ -z $device ]] ; then log_note "No device specified." return 1 fi case "$UNAME" in Linux) swapon -s | grep -wq $(readlink -f $device) ;; FreeBSD) swapctl -l | grep -wq $device ;; *) swap -l | grep -wq $device ;; esac } # # Setup a swap device using the provided device. # function swap_setup { typeset swapdev=$1 case "$UNAME" in Linux) log_must eval "mkswap $swapdev > /dev/null 2>&1" log_must swapon $swapdev ;; FreeBSD) log_must swapctl -a $swapdev ;; *) log_must swap -a $swapdev ;; esac return 0 } # # Cleanup a swap device on the provided device. # function swap_cleanup { typeset swapdev=$1 if is_swap_inuse $swapdev; then if is_linux; then log_must swapoff $swapdev elif is_freebsd; then log_must swapoff $swapdev else log_must swap -d $swapdev fi fi return 0 } # # Set a global system tunable (64-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable64 { set_tunable_impl "$1" "$2" Z } # # Set a global system tunable (32-bit value) # # $1 tunable name (use a NAME defined in tunables.cfg) # $2 tunable values # function set_tunable32 { set_tunable_impl "$1" "$2" W } function set_tunable_impl { typeset name="$1" typeset value="$2" typeset mdb_cmd="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) log_unsupported "Tunable '$name' is unsupported on $UNAME" ;; "") log_fail "Tunable '$name' must be added to tunables.cfg" ;; *) ;; esac [[ -z "$value" ]] && return 1 [[ -z "$mdb_cmd" ]] && return 1 case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/zfs/parameters" echo "$value" >"$zfs_tunables/$tunable" ;; FreeBSD) sysctl vfs.zfs.$tunable=$value ;; SunOS) echo "${tunable}/${mdb_cmd}0t${value}" | mdb -kw ;; esac } # # Get a global system tunable # # $1 tunable name (use a NAME defined in tunables.cfg) # function get_tunable { get_tunable_impl "$1" } function get_tunable_impl { typeset name="$1" typeset module="${2:-zfs}" typeset check_only="$3" eval "typeset tunable=\$$name" case "$tunable" in UNSUPPORTED) if [ -z "$check_only" ] ; then log_unsupported "Tunable '$name' is unsupported on $UNAME" else return 1 fi ;; "") if [ -z "$check_only" ] ; then log_fail "Tunable '$name' must be added to tunables.cfg" else return 1 fi ;; *) ;; esac case "$UNAME" in Linux) typeset zfs_tunables="/sys/module/$module/parameters" cat $zfs_tunables/$tunable ;; FreeBSD) sysctl -n vfs.zfs.$tunable ;; SunOS) [[ "$module" -eq "zfs" ]] || return 1 ;; esac } # Does a tunable exist? # # $1: Tunable name function tunable_exists { get_tunable_impl $1 "zfs" 1 } # # Compute MD5 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function md5digest { typeset file=$1 case "$UNAME" in FreeBSD) md5 -q $file ;; *) typeset sum _ read -r sum _ < <(md5sum -b $file) echo $sum ;; esac } # # Compute SHA256 digest for given file or stdin if no file given. # Note: file path must not contain spaces # function sha256digest { typeset file=$1 case "$UNAME" in FreeBSD) sha256 -q $file ;; *) typeset sum _ read -r sum _ < <(sha256sum -b $file) echo $sum ;; esac } function new_fs # { case "$UNAME" in FreeBSD) newfs "$@" ;; *) echo y | newfs -v "$@" ;; esac } function stat_size # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %z "$path" ;; *) stat -c %s "$path" ;; esac } function stat_mtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %m "$path" ;; *) stat -c %Y "$path" ;; esac } function stat_ctime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %c "$path" ;; *) stat -c %Z "$path" ;; esac } function stat_crtime # { typeset path=$1 case "$UNAME" in FreeBSD) stat -f %B "$path" ;; *) stat -c %W "$path" ;; esac } function stat_generation # { typeset path=$1 case "$UNAME" in Linux) getversion "${path}" ;; *) stat -f %v "${path}" ;; esac } # Run a command as if it was being run in a TTY. # # Usage: # # faketty command # function faketty { if is_freebsd; then script -q /dev/null env "$@" else script --return --quiet -c "$*" /dev/null fi } # # Produce a random permutation of the integers in a given range (inclusive). # function range_shuffle # begin end { typeset -i begin=$1 typeset -i end=$2 seq ${begin} ${end} | sort -R } # # Cross-platform xattr helpers # function get_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) getextattr -qq user "${name}" "${path}" ;; *) attr -qg "${name}" "${path}" ;; esac } function set_xattr # name value path { typeset name=$1 typeset value=$2 typeset path=$3 case "$UNAME" in FreeBSD) setextattr user "${name}" "${value}" "${path}" ;; *) attr -qs "${name}" -V "${value}" "${path}" ;; esac } function set_xattr_stdin # name value { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) setextattr -i user "${name}" "${path}" ;; *) attr -qs "${name}" "${path}" ;; esac } function rm_xattr # name path { typeset name=$1 typeset path=$2 case "$UNAME" in FreeBSD) rmextattr -q user "${name}" "${path}" ;; *) attr -qr "${name}" "${path}" ;; esac } function ls_xattr # path { typeset path=$1 case "$UNAME" in FreeBSD) lsextattr -qq user "${path}" ;; *) attr -ql "${path}" ;; esac } function kstat # stat flags? { typeset stat=$1 typeset flags=${2-"-n"} case "$UNAME" in FreeBSD) sysctl $flags kstat.zfs.misc.$stat ;; Linux) cat "/proc/spl/kstat/zfs/$stat" 2>/dev/null ;; *) false ;; esac } function get_arcstat # stat { typeset stat=$1 case "$UNAME" in FreeBSD) kstat arcstats.$stat ;; Linux) kstat arcstats | awk "/$stat/"' { print $3 }' ;; *) false ;; esac } function punch_hole # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in FreeBSD) truncate -d -o $offset -l $length "$file" ;; Linux) fallocate --punch-hole --offset $offset --length $length "$file" ;; *) false ;; esac } function zero_range # offset length file { typeset offset=$1 typeset length=$2 typeset file=$3 case "$UNAME" in Linux) fallocate --zero-range --offset $offset --length $length "$file" ;; *) false ;; esac } # # Wait for the specified arcstat to reach non-zero quiescence. # If echo is 1 echo the value after reaching quiescence, otherwise # if echo is 0 print the arcstat we are waiting on. # function arcstat_quiescence # stat echo { typeset stat=$1 typeset echo=$2 typeset do_once=true if [[ $echo -eq 0 ]]; then echo "Waiting for arcstat $1 quiescence." fi while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do typeset stat1=$(get_arcstat $stat) - sleep 2 + sleep 0.5 typeset stat2=$(get_arcstat $stat) do_once=false done if [[ $echo -eq 1 ]]; then echo $stat2 fi } function arcstat_quiescence_noecho # stat { typeset stat=$1 arcstat_quiescence $stat 0 } function arcstat_quiescence_echo # stat { typeset stat=$1 arcstat_quiescence $stat 1 } # # Given an array of pids, wait until all processes # have completed and check their return status. # function wait_for_children #children { rv=0 children=("$@") for child in "${children[@]}" do child_exit=0 wait ${child} || child_exit=$? if [ $child_exit -ne 0 ]; then echo "child ${child} failed with ${child_exit}" rv=1 fi done return $rv } # # Compare two directory trees recursively in a manner similar to diff(1), but # using rsync. If there are any discrepancies, a summary of the differences are # output and a non-zero error is returned. # # If you're comparing a directory after a ZIL replay, you should set # LIBTEST_DIFF_ZIL_REPLAY=1 or use replay_directory_diff which will cause # directory_diff to ignore mtime changes (the ZIL replay won't fix up mtime # information). # function directory_diff # dir_a dir_b { dir_a="$1" dir_b="$2" zil_replay="${LIBTEST_DIFF_ZIL_REPLAY:-0}" # If one of the directories doesn't exist, return 2. This is to match the # semantics of diff. if ! [ -d "$dir_a" -a -d "$dir_b" ]; then return 2 fi # Run rsync with --dry-run --itemize-changes to get something akin to diff # output, but rsync is far more thorough in detecting differences (diff # doesn't compare file metadata, and cannot handle special files). # # Also make sure to filter out non-user.* xattrs when comparing. On # SELinux-enabled systems the copied tree will probably have different # SELinux labels. args=("-nicaAHX" '--filter=-x! user.*' "--delete") # NOTE: Quite a few rsync builds do not support --crtimes which would be # necessary to verify that creation times are being maintained properly. # Unfortunately because of this we cannot use it unconditionally but we can # check if this rsync build supports it and use it then. This check is # based on the same check in the rsync test suite (testsuite/crtimes.test). # # We check ctimes even with zil_replay=1 because the ZIL does store # creation times and we should make sure they match (if the creation times # do not match there is a "c" entry in one of the columns). if rsync --version | grep -q "[, ] crtimes"; then args+=("--crtimes") else log_note "This rsync package does not support --crtimes (-N)." fi # If we are testing a ZIL replay, we need to ignore timestamp changes. # Unfortunately --no-times doesn't do what we want -- it will still tell # you if the timestamps don't match but rsync will set the timestamps to # the current time (leading to an itemised change entry). It's simpler to # just filter out those lines. if [ "$zil_replay" -eq 0 ]; then filter=("cat") else # Different rsync versions have different numbers of columns. So just # require that aside from the first two, all other columns must be # blank (literal ".") or a timestamp field ("[tT]"). filter=("grep" "-v" '^\..[.Tt]\+ ') fi diff="$(rsync "${args[@]}" "$dir_a/" "$dir_b/" | "${filter[@]}")" rv=0 if [ -n "$diff" ]; then echo "$diff" rv=1 fi return $rv } # # Compare two directory trees recursively, without checking whether the mtimes # match (creation times will be checked if the available rsync binary supports # it). This is necessary for ZIL replay checks (because the ZIL does not # contain mtimes and thus after a ZIL replay, mtimes won't match). # # This is shorthand for LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff <...>. # function replay_directory_diff # dir_a dir_b { LIBTEST_DIFF_ZIL_REPLAY=1 directory_diff "$@" } # # Put coredumps into $1/core.{basename} # # Output must be saved and passed to pop_coredump_pattern on cleanup # function push_coredump_pattern # dir { ulimit -c unlimited case "$UNAME" in Linux) cat /proc/sys/kernel/core_pattern /proc/sys/kernel/core_uses_pid echo "$1/core.%e" >/proc/sys/kernel/core_pattern && echo 0 >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl -n kern.corefile sysctl kern.corefile="$1/core.%N" >/dev/null ;; *) # Nothing to output – set only for this shell coreadm -p "$1/core.%f" ;; esac } # # Put coredumps back into the default location # function pop_coredump_pattern { [ -s "$1" ] || return 0 case "$UNAME" in Linux) typeset pat pid { read -r pat; read -r pid; } < "$1" echo "$pat" >/proc/sys/kernel/core_pattern && echo "$pid" >/proc/sys/kernel/core_uses_pid ;; FreeBSD) sysctl kern.corefile="$(<"$1")" >/dev/null ;; esac } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh index 6f7b9aff7c38..a9968723c3ca 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh @@ -1,112 +1,109 @@ #!/bin/ksh -p # # CDDL HEADER START # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # CDDL HEADER END # # # Copyright (c) 2020, George Amanakis. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: # Persistent L2ARC with an unencrypted ZFS file system succeeds # # STRATEGY: # 1. Create pool with a cache device. -# 2. Export and re-import pool without writing any data. -# 3. Create a random file in that pool and random read for 10 sec. -# 4. Export pool. -# 5. Read the amount of log blocks written from the header of the +# 2. Create a random file in that pool and random read for 10 sec. +# 3. Export pool. +# 4. Read the amount of log blocks written from the header of the # L2ARC device. -# 6. Import pool. -# 7. Read the amount of log blocks rebuilt in arcstats and compare to +# 5. Import pool. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to # (5). -# 8. Check if the labels of the L2ARC device are intact. +# 7. Check if the labels of the L2ARC device are intact. # # * We can predict the minimum bytes of L2ARC restored if we subtract # from the effective size of the cache device the bytes l2arc_evict() # evicts: # l2: L2ARC device size - VDEV_LABEL_START_SIZE - l2ad_dev_hdr_asize # wr_sz: l2arc_write_max + l2arc_write_boost (worst case) # blk_overhead: wr_sz / SPA_MINBLOCKSIZE / (l2 / SPA_MAXBLOCKSIZE) * # sizeof (l2arc_log_blk_phys_t) # min restored size: l2 - (wr_sz + blk_overhead) # verify_runnable "global" command -v fio > /dev/null || log_unsupported "fio missing" log_assert "Persistent L2ARC with an unencrypted ZFS file system succeeds." function cleanup { if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \ $rebuild_blocks_min_l2size } log_onexit cleanup # L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE) log_must set_tunable32 L2ARC_NOPREFETCH 0 log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0 typeset fill_mb=800 typeset cache_sz=$(( floor($fill_mb / 2) )) export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M log_must truncate -s ${cache_sz}M $VDEV_CACHE -log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE - -log_must zpool export $TESTPOOL -log_must zpool import -d $VDIR $TESTPOOL +log_must zpool create -f -o ashift=12 $TESTPOOL $VDEV +log_must zpool add $TESTPOOL cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | awk '/log_blk_count/ {print $2}') typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL arcstat_quiescence_noecho l2_size typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 log_must zpool offline $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size log_must zdb -lllq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL log_pass "Persistent L2ARC with an unencrypted ZFS file system succeeds." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh index 3cea334495d9..f238c361134f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh @@ -1,193 +1,197 @@ #!/bin/ksh # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2019, Lawrence Livermore National Security, LLC. # Copyright (c) 2021, George Amanakis. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/include/properties.shlib . $STF_SUITE/tests/functional/rsend/rsend.kshlib # # Description: # Verify encrypted raw sending to pools with greater ashift succeeds. # # Strategy: # 1) Create a set of files each containing some file data in an # encrypted filesystem. # 2) Snapshot and raw send these files to a pool with greater ashift # 3) Verify that all the xattrs (and thus the spill block) were # preserved when receiving the incremental stream. # 4) Repeat the test for a non-encrypted filesystem using raw send # verify_runnable "both" log_assert "Verify raw sending to pools with greater ashift succeeds" +if is_freebsd; then + log_unsupported "Runs too long on FreeBSD 14 (Issue #14961)" +fi + function cleanup { rm -f $BACKDIR/fs@* poolexists pool9 && destroy_pool pool9 poolexists pool12 && destroy_pool pool12 log_must rm -f $TESTDIR/vdev_a $TESTDIR/vdev_b } function xattr_test { log_must zfs set xattr=sa pool9/$1 log_must zfs set dnodesize=legacy pool9/$1 log_must zfs set recordsize=128k pool9/$1 rand_set_prop pool9/$1 compression "${compress_prop_vals[@]}" # Create 40 files each with a spill block containing xattrs. Each file # will be modified in a different way to validate the incremental receive. for i in {1..40}; do file="/pool9/$1/file$i" log_must mkfile 16384 $file for j in {1..20}; do log_must set_xattr "testattr$j" "$attrvalue" $file done done # Snapshot the pool and send it to the new dataset. log_must zfs snapshot pool9/$1@snap1 log_must eval "zfs send -w pool9/$1@snap1 >$BACKDIR/$1@snap1" log_must eval "zfs recv pool12/$1 < $BACKDIR/$1@snap1" # # Modify file[1-6]'s contents but not the spill blocks. # # file1 - Increase record size; single block # file2 - Increase record size; multiple blocks # file3 - Truncate file to zero size; single block # file4 - Truncate file to smaller size; single block # file5 - Truncate file to much larger size; add holes # file6 - Truncate file to embedded size; embedded data # log_must mkfile 32768 /pool9/$1/file1 log_must mkfile 1048576 /pool9/$1/file2 log_must truncate -s 0 /pool9/$1/file3 log_must truncate -s 8192 /pool9/$1/file4 log_must truncate -s 1073741824 /pool9/$1/file5 log_must truncate -s 50 /pool9/$1/file6 # # Modify file[11-16]'s contents and their spill blocks. # # file11 - Increase record size; single block # file12 - Increase record size; multiple blocks # file13 - Truncate file to zero size; single block # file14 - Truncate file to smaller size; single block # file15 - Truncate file to much larger size; add holes # file16 - Truncate file to embedded size; embedded data # log_must mkfile 32768 /pool9/$1/file11 log_must mkfile 1048576 /pool9/$1/file12 log_must truncate -s 0 /pool9/$1/file13 log_must truncate -s 8192 /pool9/$1/file14 log_must truncate -s 1073741824 /pool9/$1/file15 log_must truncate -s 50 /pool9/$1/file16 for i in {11..20}; do log_must rm_xattr testattr1 /pool9/$1/file$i done # # Modify file[21-26]'s contents and remove their spill blocks. # # file21 - Increase record size; single block # file22 - Increase record size; multiple blocks # file23 - Truncate file to zero size; single block # file24 - Truncate file to smaller size; single block # file25 - Truncate file to much larger size; add holes # file26 - Truncate file to embedded size; embedded data # log_must mkfile 32768 /pool9/$1/file21 log_must mkfile 1048576 /pool9/$1/file22 log_must truncate -s 0 /pool9/$1/file23 log_must truncate -s 8192 /pool9/$1/file24 log_must truncate -s 1073741824 /pool9/$1/file25 log_must truncate -s 50 /pool9/$1/file26 for i in {21..30}; do for j in {1..20}; do log_must rm_xattr testattr$j /pool9/$1/file$i done done # # Modify file[31-40]'s spill blocks but not the file contents. # for i in {31..40}; do file="/pool9/$1/file$i" log_must rm_xattr testattr$(((RANDOM % 20) + 1)) $file log_must set_xattr testattr$(((RANDOM % 20) + 1)) "$attrvalue" $file done # Snapshot the pool and send the incremental snapshot. log_must zfs snapshot pool9/$1@snap2 log_must eval "zfs send -w -i pool9/$1@snap1 pool9/$1@snap2 >$BACKDIR/$1@snap2" log_must eval "zfs recv pool12/$1 < $BACKDIR/$1@snap2" } attrvalue="abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz" log_onexit cleanup # Create pools truncate -s $MINVDEVSIZE $TESTDIR/vdev_a truncate -s $MINVDEVSIZE $TESTDIR/vdev_b log_must zpool create -f -o ashift=9 pool9 $TESTDIR/vdev_a log_must zpool create -f -o ashift=12 pool12 $TESTDIR/vdev_b # Create encrypted fs log_must eval "echo 'password' | zfs create -o encryption=on" \ "-o keyformat=passphrase -o keylocation=prompt " \ "pool9/encfs" # Run xattr tests for encrypted fs xattr_test encfs # Calculate the expected recursive checksum for source encrypted fs expected_cksum=$(recursive_cksum /pool9/encfs) # Mount target encrypted fs log_must eval "echo 'password' | zfs load-key pool12/encfs" log_must zfs mount pool12/encfs # Validate the received copy using the received recursive checksum actual_cksum=$(recursive_cksum /pool12/encfs) if [[ "$expected_cksum" != "$actual_cksum" ]]; then log_fail "Checksums differ ($expected_cksum != $actual_cksum)" fi # Perform the same test but without encryption (send -w) log_must zfs create pool9/fs # Run xattr tests for non-encrypted fs xattr_test fs # Calculate the expected recursive checksum for source non-encrypted fs expected_cksum=$(recursive_cksum /pool9/fs) # Validate the received copy using the received recursive checksum actual_cksum=$(recursive_cksum /pool12/fs) if [[ "$expected_cksum" != "$actual_cksum" ]]; then log_fail "Checksums differ ($expected_cksum != $actual_cksum)" fi log_pass "Verify raw sending to pools with greater ashift succeeds" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index dc40483fe785..caf1b68f8728 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,1090 +1,1090 @@ /* * $FreeBSD$ */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* #undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* block multiqueue is available */ /* #undef HAVE_BLK_MQ */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* dentry aliases are in d_u member */ /* #undef HAVE_DENTRY_D_U_ALIASES */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* FALLOC_FL_ZERO_RANGE is defined */ /* #undef HAVE_FALLOC_FL_ZERO_RANGE */ /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* filemap_range_has_page() is available */ /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* fops->fadvise() exists */ /* #undef HAVE_FILE_FADVISE */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->setattr() exists */ /* #undef HAVE_IOPS_SETATTR */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* linux/blk-cgroup.h exists */ /* #undef HAVE_LINUX_BLK_CGROUP_HEADER */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* iops->rename2() exists */ /* #undef HAVE_RENAME2 */ /* struct inode_operations_wrapper takes .rename2() */ /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. */ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* cs->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ /* user_namespace->ns.inum exists */ /* #undef HAVE_USER_NS_COMMON_INUM */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES */ #define HAVE_XSAVES 1 /* ZERO_PAGE() is GPL-only */ /* #undef HAVE_ZERO_PAGE_GPL_ONLY */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* using complete_and_exit() instead */ /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* pde_data() is PDE_DATA() */ /* #undef SPL_PDE_DATA */ /* Define to 1 if all of the C90 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* Version number of package */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* GENHD_FL_EXT_DEVT flag is not available */ /* #undef ZFS_GENHD_FL_EXT_DEVT */ /* GENHD_FL_NO_PART_SCAN flag is available */ /* #undef ZFS_GENHD_FL_NO_PART */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gfeff9dfed" +#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_g10e36e176" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ #define ZFS_META_KVER_MAX "6.3" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. */ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_gfeff9dfed" +#define ZFS_META_RELEASE "FreeBSD_g10e36e176" /* Define the project version. */ #define ZFS_META_VERSION "2.1.99" /* count is located in percpu_ref.data */ /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */ diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 932bf9730c7a..668f15e85337 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.1.99-1993-gfeff9dfed" +#define ZFS_META_GITREV "zfs-2.1.99-1998-g10e36e176"