diff --git a/include/sys/arc.h b/include/sys/arc.h index 5148905c93d8..0e0a828c1d93 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -1,360 +1,369 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_ARC_H #define _SYS_ARC_H #include #ifdef __cplusplus extern "C" { #endif #include #include #include #include /* * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. */ #define ARC_EVICT_ALL UINT64_MAX /* * ZFS gets very unhappy when the maximum ARC size is smaller than the maximum * block size and a larger block is written. To leave some safety margin, we * limit the minimum for zfs_arc_max to the maximium transaction size. */ #define MIN_ARC_MAX DMU_MAX_ACCESS #define HDR_SET_LSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) #define HDR_SET_PSIZE(hdr, x) do { \ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ } while (0) +/* The l2size in the header is only used by L2 cache */ +#define HDR_SET_L2SIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_l2size = ((x) >> SPA_MINBLOCKSHIFT); \ +} while (0) + #define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) #define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) +#define HDR_GET_L2SIZE(hdr) ((hdr)->b_l2size << SPA_MINBLOCKSHIFT) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; /* * Because the ARC can store encrypted data, errors (not due to bugs) may arise * while transforming data into its desired format - specifically, when * decrypting, the key may not be present, or the HMAC may not be correct * which signifies deliberate tampering with the on-disk state * (assuming that the checksum was correct). If any error occurs, the "buf" * parameter will be NULL. */ typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); typedef void arc_prune_func_t(uint64_t bytes, void *priv); /* Shared module parameters */ extern uint_t zfs_arc_average_blocksize; extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; arc_read_done_func_t arc_getbuf_func; /* generic arc_prune_func_t wrapper for callbacks */ struct arc_prune { arc_prune_func_t *p_pfunc; void *p_private; uint64_t p_adjust; list_node_t p_node; zfs_refcount_t p_refcnt; }; typedef enum arc_strategy { ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ } arc_strategy_t; typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_buf_hdr_t. These flags should * only be set by ARC code. */ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ /* * Encrypted or authenticated on disk (may be plaintext in memory). * This header has b_crypt_hdr allocated. Does not include indirect * blocks with checksums of MACs which will also have their X * (encrypted) bit set in the bp. */ ARC_FLAG_PROTECTED = 1 << 15, /* data has not been authenticated yet */ ARC_FLAG_NOAUTH = 1 << 16, /* indicates that the buffer contains metadata (otherwise, data) */ ARC_FLAG_BUFC_METADATA = 1 << 17, /* Flags specifying whether optional hdr struct fields are defined */ ARC_FLAG_HAS_L1HDR = 1 << 18, ARC_FLAG_HAS_L2HDR = 1 << 19, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. */ ARC_FLAG_COMPRESSED_ARC = 1 << 20, ARC_FLAG_SHARED_DATA = 1 << 21, /* * Fail this arc_read() (with ENOENT) if the data is not already present * in cache. */ ARC_FLAG_CACHED_ONLY = 1 << 22, /* * Don't instantiate an arc_buf_t for arc_read_done. */ ARC_FLAG_NO_BUF = 1 << 23, /* * The arc buffer's compression mode is stored in the top 7 bits of the * flags field, so these dummy flags are included so that MDB can * interpret the enum properly. */ ARC_FLAG_COMPRESS_0 = 1 << 24, ARC_FLAG_COMPRESS_1 = 1 << 25, ARC_FLAG_COMPRESS_2 = 1 << 26, ARC_FLAG_COMPRESS_3 = 1 << 27, ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 } arc_flags_t; typedef enum arc_buf_flags { ARC_BUF_FLAG_SHARED = 1 << 0, ARC_BUF_FLAG_COMPRESSED = 1 << 1, /* * indicates whether this arc_buf_t is encrypted, regardless of * state on-disk */ ARC_BUF_FLAG_ENCRYPTED = 1 << 2 } arc_buf_flags_t; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; void *b_data; arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * The following breakdowns of arc_size exist for kstat only. */ typedef enum arc_space_type { ARC_SPACE_DATA, ARC_SPACE_META, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_DBUF, ARC_SPACE_DNODE, ARC_SPACE_BONUS, ARC_SPACE_ABD_CHUNK_WASTE, ARC_SPACE_NUMTYPES } arc_space_type_t; typedef enum arc_state_type { ARC_STATE_ANON, ARC_STATE_MRU, ARC_STATE_MRU_GHOST, ARC_STATE_MFU, ARC_STATE_MFU_GHOST, ARC_STATE_L2C_ONLY, ARC_STATE_UNCACHED, ARC_STATE_NUMTYPES } arc_state_type_t; typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; uint32_t abi_flags; uint32_t abi_bufcnt; uint64_t abi_size; uint64_t abi_spa; uint64_t abi_access; uint32_t abi_mru_hits; uint32_t abi_mru_ghost_hits; uint32_t abi_mfu_hits; uint32_t abi_mfu_ghost_hits; uint32_t abi_l2arc_hits; uint32_t abi_holds; uint64_t abi_l2arc_dattr; uint64_t abi_l2arc_asize; enum zio_compress abi_l2arc_compress; } arc_buf_info_t; /* * Flags returned by arc_cached; describes which part of the arc * the block is cached in. */ #define ARC_CACHED_EMBEDDED (1U << 0) #define ARC_CACHED_IN_L1 (1U << 1) #define ARC_CACHED_IN_MRU (1U << 2) #define ARC_CACHED_IN_MFU (1U << 3) #define ARC_CACHED_IN_L2 (1U << 4) void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); boolean_t arc_is_encrypted(arc_buf_t *buf); boolean_t arc_is_unauthenticated(arc_buf_t *buf); enum zio_compress arc_get_compression(arc_buf_t *buf); void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac); int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place); void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac); arc_buf_t *arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size); arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); uint8_t arc_get_complevel(arc_buf_t *buf); arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel); void arc_return_buf(arc_buf_t *buf, const void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag); void arc_buf_destroy(arc_buf_t *buf, const void *tag); void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); uint64_t arc_buf_size(arc_buf_t *buf); uint64_t arc_buf_lsize(arc_buf_t *buf); void arc_buf_access(arc_buf_t *buf); void arc_release(arc_buf_t *buf, const void *tag); int arc_released(arc_buf_t *buf); void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #else #define arc_referenced(buf) ((void) sizeof (buf), 0) #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, arc_write_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); int arc_cached(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); +void arc_flush_async(spa_t *spa); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); +boolean_t arc_async_flush_guid_inuse(uint64_t load_guid); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); /* * Level 2 ARC */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); void l2arc_spa_rebuild_stop(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; #endif #ifdef __cplusplus } #endif #endif /* _SYS_ARC_H */ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index b2839bdf1485..2cf6aa3b1825 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -1,1093 +1,1093 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Delphix. All rights reserved. * Copyright (c) 2013, Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. */ #ifndef _SYS_ARC_IMPL_H #define _SYS_ARC_IMPL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_uncached - uncacheable prefetch, to be evicted * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * supports the "dbufs" kstat */ arc_state_type_t arcs_state; /* * total amount of data in this state. */ zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; /* * total amount of evictable data in this state */ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * amount of hit bytes for this state (counted only for ghost states) */ wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; boolean_t acb_wait; int acb_wait_error; kmutex_t acb_wait_lock; kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_prev; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* protected by hash lock */ clock_t b_arc_access; uint32_t b_mru_hits; uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ zfs_refcount_t b_refcnt; arc_callback_t *b_acb; abd_t *b_pabd; #ifdef ZFS_DEBUG zio_cksum_t *b_freeze_cksum; kmutex_t b_freeze_lock; #endif } l1arc_buf_hdr_t; typedef enum l2arc_dev_hdr_flags_t { L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ } l2arc_dev_hdr_flags_t; /* * Pointer used in persistent L2ARC (for pointing to log blocks). */ typedef struct l2arc_log_blkptr { /* * Offset of log block within the device, in bytes */ uint64_t lbp_daddr; /* * Aligned payload size (in bytes) of the log block */ uint64_t lbp_payload_asize; /* * Offset in bytes of the first buffer in the payload */ uint64_t lbp_payload_start; /* * lbp_prop has the following format: * * logical size (in bytes) * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ uint64_t lbp_prop; zio_cksum_t lbp_cksum; /* checksum of log */ } l2arc_log_blkptr_t; /* * The persistent L2ARC device header. * Byte order of magic determines whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_dev_hdr_phys { uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ uint64_t dh_version; /* Persistent L2ARC version */ /* * Global L2ARC device state and metadata. */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ uint64_t dh_start; /* mirror of l2ad_start */ uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ /* * Mirrors of vdev_trim_action_time and vdev_trim_state, used to * display when the cache device was fully trimmed for the last * time. */ uint64_t dh_trim_action_time; uint64_t dh_trim_state; const uint64_t dh_pad[30]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; _Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE, "l2arc_dev_hdr_phys_t wrong size"); /* * A single ARC buffer header entry in a l2arc_log_blk_phys_t. */ typedef struct l2arc_log_ent_phys { dva_t le_dva; /* dva of buffer */ uint64_t le_birth; /* birth txg of buffer */ /* * le_prop has the following format: * * logical size (in bytes) * * physical (compressed) size (in bytes) * * compression algorithm * * object type (used to restore arc_buf_contents_t) * * protected status (used for encryption) * * prefetch status (used in l2arc_read_done()) */ uint64_t le_prop; uint64_t le_daddr; /* buf location on l2dev */ uint64_t le_complevel; /* * We pad the size of each entry to a power of 2 so that the size of * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, * because of the L2ARC_SET_*SIZE macros. */ const uint64_t le_pad[2]; /* pad to 64 bytes */ } l2arc_log_ent_phys_t; #define L2ARC_LOG_BLK_MAX_ENTRIES (1022) /* * A log block of up to 1022 ARC buffer log entries, chained into the * persistent L2ARC metadata linked list. Byte order of magic determines * whether 64-bit bswap of fields is necessary. */ typedef struct l2arc_log_blk_phys { uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ /* * There are 2 chains (headed by dh_start_lbps[2]), and this field * points back to the previous block in this chain. We alternate * which chain we append to, so they are time-wise and offset-wise * interleaved, but that is an optimization rather than for * correctness. */ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ /* * Pad header section to 128 bytes */ uint64_t lb_pad[7]; /* Payload */ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; } l2arc_log_blk_phys_t; /* 64K total */ /* * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. */ _Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned"); _Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE, "l2arc_log_blk_phys_t too small"); _Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE, "l2arc_log_blk_phys_t too big"); /* * These structures hold in-flight abd buffers for log blocks as they're being * written to the L2ARC device. */ typedef struct l2arc_lb_abd_buf { abd_t *abd; list_node_t node; } l2arc_lb_abd_buf_t; /* * These structures hold pointers to log blocks present on the L2ARC device. */ typedef struct l2arc_lb_ptr_buf { l2arc_log_blkptr_t *lb_ptr; list_node_t node; } l2arc_lb_ptr_buf_t; /* Macros for setting fields in le_prop and lbp_prop */ #define L2BLK_GET_LSIZE(field) \ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_LSIZE(field, x) \ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_PSIZE(field) \ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) #define L2BLK_SET_PSIZE(field, x) \ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) #define L2BLK_GET_COMPRESS(field) \ BF64_GET((field), 32, SPA_COMPRESSBITS) #define L2BLK_SET_COMPRESS(field, x) \ BF64_SET((field), 32, SPA_COMPRESSBITS, x) #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) /* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. */ #define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) #define PTR_SWAP(x, y) \ do { \ void *tmp = (x);\ x = y; \ y = tmp; \ } while (0) #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ /* * L2ARC Internals */ typedef struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ + vdev_t *l2ad_vdev; /* can be NULL during remove */ + spa_t *l2ad_spa; /* can be NULL during remove */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ zfs_refcount_t l2ad_alloc; /* allocated bytes */ /* * Persistence-related stuff */ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ int l2ad_log_ent_idx; /* index into cur log blk */ /* Number of bytes in current log block's payload */ uint64_t l2ad_log_blk_payload_asize; /* * Offset (in bytes) of the first buffer in current log block's * payload. */ uint64_t l2ad_log_blk_payload_start; /* Flag indicating whether a rebuild is scheduled or is going on */ boolean_t l2ad_rebuild; boolean_t l2ad_rebuild_cancel; boolean_t l2ad_rebuild_began; uint64_t l2ad_log_entries; /* entries per log blk */ uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; /* * Aligned size of all log blocks as accounted by vdev_space_update(). */ zfs_refcount_t l2ad_lb_asize; /* * Number of log blocks present on the device. */ zfs_refcount_t l2ad_lb_count; boolean_t l2ad_trim_all; /* TRIM whole device */ } l2arc_dev_t; /* * Encrypted blocks will need to be stored encrypted on the L2ARC * disk as they appear in the main pool. In order for this to work we * need to pass around the encryption parameters so they can be used * to write data to the L2ARC. This struct is only defined in the * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED * flag set. */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; dmu_object_type_t b_ot; /* object type */ /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; /* * Technically this could be removed since we will always be able to * get the mac from the bp when we need it. However, it is inconvenient * for callers of arc code to have to pass a bp in all the time. This * also allows us to assert that L2ARC data is properly encrypted to * match the data in the main storage pool. */ uint8_t b_mac[ZIO_DATA_MAC_LEN]; } arc_buf_hdr_crypt_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ /* in-flight list of log blocks */ list_t l2wcb_abd_list; } l2arc_write_callback_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; uint8_t b_complevel; - uint8_t b_reserved1; /* used for 4 byte alignment */ - uint16_t b_reserved2; /* used for 4 byte alignment */ + uint8_t b_reserved1; /* used for 4 byte alignment */ + uint16_t b_l2size; /* alignment or L2-only size */ arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; /* * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED * is set and the L1 header exists. */ arc_buf_hdr_crypt_t b_crypt_hdr; }; typedef struct arc_stats { /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; /* Number of requests for which I/O was already running. */ kstat_named_t arcstat_iohits; /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; /* Same three, but specifically for prefetch metadata. */ kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_uncached_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped when updating the access state due to the * header having already been released after acquiring the hash lock. */ kstat_named_t arcstat_access_skip; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_eligible_mfu; kstat_named_t arcstat_evict_l2_eligible_mru; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_meta; kstat_named_t arcstat_pd; kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by dmu_buf_impl_t objects. */ kstat_named_t arcstat_dbuf_size; /* * Number of bytes consumed by dnode_t objects. */ kstat_named_t arcstat_dnode_size; /* * Number of bytes consumed by bonus buffers. */ kstat_named_t arcstat_bonus_size; #if defined(COMPAT_FREEBSD11) /* * Sum of the previous three counters, provided for compatibility. */ kstat_named_t arcstat_other_size; #endif /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; kstat_named_t arcstat_anon_data; kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; kstat_named_t arcstat_mru_data; kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; kstat_named_t arcstat_mru_ghost_data; kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; kstat_named_t arcstat_mfu_data; kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; kstat_named_t arcstat_mfu_ghost_data; kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; /* * Total number of bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_size; kstat_named_t arcstat_uncached_data; kstat_named_t arcstat_uncached_metadata; /* * Number of data bytes that are going to be evicted from ARC due to * ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_data; /* * Number of metadata bytes that that are going to be evicted from ARC * due to ARC_FLAG_UNCACHED being set. */ kstat_named_t arcstat_uncached_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* * Allocated size (in bytes) of L2ARC cached buffers by ARC state. */ kstat_named_t arcstat_l2_prefetch_asize; kstat_named_t arcstat_l2_mru_asize; kstat_named_t arcstat_l2_mfu_asize; /* * Allocated size (in bytes) of L2ARC cached buffers by buffer content * type. */ kstat_named_t arcstat_l2_bufc_data_asize; kstat_named_t arcstat_l2_bufc_metadata_asize; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_lsize; kstat_named_t arcstat_l2_psize; kstat_named_t arcstat_l2_hdr_size; /* * Number of L2ARC log blocks written. These are used for restoring the * L2ARC. Updated during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_log_blk_writes; /* * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ kstat_named_t arcstat_l2_log_blk_avg_asize; /* Aligned size of L2ARC log blocks on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_asize; /* Number of L2ARC log blocks present on L2ARC devices. */ kstat_named_t arcstat_l2_log_blk_count; /* * Moving average of the aligned size of L2ARC restored data, in bytes, * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; /* * Number of times the L2ARC rebuild was successful for an L2ARC device. */ kstat_named_t arcstat_l2_rebuild_success; /* * Number of times the L2ARC rebuild failed because the device header * was in an unsupported format or corrupted. */ kstat_named_t arcstat_l2_rebuild_abort_unsupported; /* * Number of times the L2ARC rebuild failed because of IO errors * while reading a log block. */ kstat_named_t arcstat_l2_rebuild_abort_io_errors; /* * Number of times the L2ARC rebuild failed because of IO errors when * reading the device header. */ kstat_named_t arcstat_l2_rebuild_abort_dh_errors; /* * Number of L2ARC log blocks which failed to be restored due to * checksum errors. */ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; /* * Number of times the L2ARC rebuild was aborted due to low system * memory. */ kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; /* Aligned size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. */ kstat_named_t arcstat_l2_rebuild_bufs; /* * Number of L2ARC log entries (buffers) already cached in ARC. These * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. */ kstat_named_t arcstat_l2_rebuild_log_blks; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_memory_all_bytes; kstat_named_t arcstat_memory_free_bytes; kstat_named_t arcstat_memory_available_bytes; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_async_upgrade_sync; /* Number of predictive prefetch requests. */ kstat_named_t arcstat_predictive_prefetch; /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; /* Number of requests for which predictive prefetch was running. */ kstat_named_t arcstat_demand_iohit_predictive_prefetch; /* Number of prescient prefetch requests. */ kstat_named_t arcstat_prescient_prefetch; /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; /* Number of requests for which prescient prefetch was running. */ kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; kstat_named_t arcstat_cached_only_in_progress; kstat_named_t arcstat_abd_chunk_waste_size; } arc_stats_t; typedef struct arc_sums { wmsum_t arcstat_hits; wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; wmsum_t arcstat_mfu_hits; wmsum_t arcstat_mfu_ghost_hits; wmsum_t arcstat_uncached_hits; wmsum_t arcstat_deleted; wmsum_t arcstat_mutex_miss; wmsum_t arcstat_access_skip; wmsum_t arcstat_evict_skip; wmsum_t arcstat_evict_not_enough; wmsum_t arcstat_evict_l2_cached; wmsum_t arcstat_evict_l2_eligible; wmsum_t arcstat_evict_l2_eligible_mfu; wmsum_t arcstat_evict_l2_eligible_mru; wmsum_t arcstat_evict_l2_ineligible; wmsum_t arcstat_evict_l2_skip; wmsum_t arcstat_hash_elements; wmsum_t arcstat_hash_collisions; wmsum_t arcstat_hash_chains; aggsum_t arcstat_size; wmsum_t arcstat_compressed_size; wmsum_t arcstat_uncompressed_size; wmsum_t arcstat_overhead_size; wmsum_t arcstat_hdr_size; wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; wmsum_t arcstat_l2_prefetch_asize; wmsum_t arcstat_l2_mru_asize; wmsum_t arcstat_l2_mfu_asize; wmsum_t arcstat_l2_bufc_data_asize; wmsum_t arcstat_l2_bufc_metadata_asize; wmsum_t arcstat_l2_feeds; wmsum_t arcstat_l2_rw_clash; wmsum_t arcstat_l2_read_bytes; wmsum_t arcstat_l2_write_bytes; wmsum_t arcstat_l2_writes_sent; wmsum_t arcstat_l2_writes_done; wmsum_t arcstat_l2_writes_error; wmsum_t arcstat_l2_writes_lock_retry; wmsum_t arcstat_l2_evict_lock_retry; wmsum_t arcstat_l2_evict_reading; wmsum_t arcstat_l2_evict_l1cached; wmsum_t arcstat_l2_free_on_write; wmsum_t arcstat_l2_abort_lowmem; wmsum_t arcstat_l2_cksum_bad; wmsum_t arcstat_l2_io_error; wmsum_t arcstat_l2_lsize; wmsum_t arcstat_l2_psize; aggsum_t arcstat_l2_hdr_size; wmsum_t arcstat_l2_log_blk_writes; wmsum_t arcstat_l2_log_blk_asize; wmsum_t arcstat_l2_log_blk_count; wmsum_t arcstat_l2_rebuild_success; wmsum_t arcstat_l2_rebuild_abort_unsupported; wmsum_t arcstat_l2_rebuild_abort_io_errors; wmsum_t arcstat_l2_rebuild_abort_dh_errors; wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; wmsum_t arcstat_l2_rebuild_abort_lowmem; wmsum_t arcstat_l2_rebuild_size; wmsum_t arcstat_l2_rebuild_asize; wmsum_t arcstat_l2_rebuild_bufs; wmsum_t arcstat_l2_rebuild_bufs_precached; wmsum_t arcstat_l2_rebuild_log_blks; wmsum_t arcstat_memory_throttle_count; wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; wmsum_t arcstat_demand_iohit_predictive_prefetch; wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; } arc_sums_t; typedef struct arc_evict_waiter { list_node_t aew_node; kcondvar_t aew_cv; uint64_t aew_count; } arc_evict_waiter_t; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ wmsum_add(&arc_sums.stat, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ #define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ #define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ #define arc_anon (&ARC_anon) #define arc_mru (&ARC_mru) #define arc_mru_ghost (&ARC_mru_ghost) #define arc_mfu (&ARC_mfu) #define arc_mfu_ghost (&ARC_mfu_ghost) #define arc_l2c_only (&ARC_l2c_only) #define arc_uncached (&ARC_uncached) extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; extern uint_t arc_grow_retry; extern uint_t arc_no_grow_shift; extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; extern uint_t arc_lotsfree_percent; extern uint64_t zfs_arc_min; extern uint64_t zfs_arc_max; extern uint64_t arc_reduce_target_size(uint64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void arc_kmem_reap_soon(void); extern void arc_wait_for_eviction(uint64_t, boolean_t, boolean_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); extern void arc_register_hotplug(void); extern void arc_unregister_hotplug(void); extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); /* used in zdb.c */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); /* used in vdev_trim.c */ void l2arc_dev_hdr_update(l2arc_dev_t *dev); l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); #ifdef __cplusplus } #endif #endif /* _SYS_ARC_IMPL_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 510d1119bffd..23c20294d1f8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1,1278 +1,1279 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Datto Inc. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct brt_vdev brt_vdev_t; typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct zbookmark_phys zbookmark_phys_t; typedef struct zbookmark_err_phys zbookmark_err_phys_t; struct bpobj; struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). */ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 #define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * * The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). * The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */ BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1])) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ /* * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for * this gang DVA including its children BP's. The space allocated at this * DVA's vdev/offset is vdev_gang_header_asize(vdev). */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) /* encrypted, authenticated, and MAC cksum bps use the same bit */ #define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) #define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) #define BP_IS_ENCRYPTED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_IS_AUTHENTICATED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) #define BP_IS_PROTECTED(bp) \ (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) #define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] #define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) #define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] #define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) #define BP_GET_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ BP_GET_LOGICAL_BIRTH(bp)) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BP_SET_LOGICAL_BIRTH(bp, logical); \ BP_SET_PHYSICAL_BIRTH(bp, \ ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) #define BP_SET_FILL(bp, fill) \ { \ if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ } #define BP_GET_IV2(bp) \ (ASSERT(BP_IS_ENCRYPTED(bp)), \ BF64_GET((bp)->blk_fill, 32, 32)) #define BP_SET_IV2(bp, iv2) \ { \ ASSERT(BP_IS_ENCRYPTED(bp)); \ BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) #define BP_SET_REDACTED(bp) \ { \ BP_SET_EMBEDDED(bp, B_TRUE); \ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ } #define BP_IS_REDACTED(bp) \ (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ } #define BP_ZERO(bp) \ { \ BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_birth_word[0] = 0; \ (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #ifdef _ZFS_BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either kmem_scnprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *const copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ ASSERT3S(copies, >, 0); \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ (u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; typedef enum spa_mode { SPA_MODE_UNINIT = 0, SPA_MODE_READ = 1, SPA_MODE_WRITE = 2, } spa_mode_t; /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, } spa_autotrim_t; /* * Reason TRIM command was issued, used internally for accounting purposes. */ typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_alloc(spa_t *spa, uint64_t guid); extern int spa_vdev_noalloc(spa_t *spa, uint64_t guid); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; /* spa sync taskqueues */ taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); void spa_sync_tq_destroy(spa_t *spa); uint_t spa_acq_allocator(spa_t *spa); void spa_rel_allocator(spa_t *spa, uint_t allocator); void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; extern avl_tree_t spa_namespace_avl; extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); extern void spa_close(spa_t *spa, const void *tag); extern void spa_async_close(spa_t *spa, const void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *priv; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t guid; /* pool guid */ spa_history_kstat_t iostats; } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; /* Assorted pool IO kstats */ typedef struct spa_iostats { kstat_named_t trim_extents_written; kstat_named_t trim_bytes_written; kstat_named_t trim_extents_skipped; kstat_named_t trim_bytes_skipped; kstat_named_t trim_extents_failed; kstat_named_t trim_bytes_failed; kstat_named_t autotrim_extents_written; kstat_named_t autotrim_bytes_written; kstat_named_t autotrim_extents_skipped; kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; kstat_named_t simple_trim_extents_written; kstat_named_t simple_trim_bytes_written; kstat_named_t simple_trim_extents_skipped; kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; kstat_named_t arc_read_count; kstat_named_t arc_read_bytes; kstat_named_t arc_write_count; kstat_named_t arc_write_bytes; kstat_named_t direct_read_count; kstat_named_t direct_read_bytes; kstat_named_t direct_write_count; kstat_named_t direct_write_bytes; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining); extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); extern void spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) __printflike(2, 3); extern void spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) __printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern uint32_t spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_get_last_scrubbed_txg(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); extern int spa_get_allocator(spa_t *spa); extern void spa_set_allocator(spa_t *spa, const char *allocator); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_load_note(spa_t *spa, const char *fmt, ...) __attribute__((format(printf, 2, 3))); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); +extern uint64_t spa_generate_load_guid(void); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa, const uint64_t *guidp); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_l2cache(spa_t *, uint64_t guid); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); extern void zfs_ereport_taskq_fini(void); extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); extern void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx); extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, dmu_tx_t *tx); extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg); extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, nvlist_t *outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); extern void zfs_ereport_zvol_post(const char *subclass, const char *name, const char *device_name, const char *raw_name); /* waiting for pool activities to complete */ extern int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited); extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited); extern void spa_notify_waiters(spa_t *spa); extern void spa_wake_waiters(spa_t *spa); extern void spa_import_os(spa_t *spa); extern void spa_export_os(spa_t *spa); extern void spa_activate_os(spa_t *spa); extern void spa_deactivate_os(spa_t *spa); /* module param call functions */ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; extern uint64_t zfs_deadman_synctime_ms; extern uint64_t zfs_deadman_ziotime_ms; extern uint64_t zfs_deadman_checktime_ms; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index fa7baac04b7b..bd6f076dfbbd 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1,10883 +1,11096 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. - * Copyright (c) 2019, 2023, Klara Inc. + * Copyright (c) 2019, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed. For example, when using the ZPL each dentry * holds a references on a znode. These dentries must be pruned before * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * Balance between metadata and data on ghost hits. Values above 100 * increase metadata caching by proportionally reducing effect of ghost * data hits on target data/metadata rate. */ static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; +/* Used by spa_export/spa_destroy to flush the arc asynchronously */ +static taskq_t *arc_flush_taskq; + /* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "meta", KSTAT_DATA_UINT64 }, { "pd", KSTAT_DATA_UINT64 }, { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_data", KSTAT_DATA_UINT64 }, { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_data", KSTAT_DATA_UINT64 }, { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_data", KSTAT_DATA_UINT64 }, { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_data", KSTAT_DATA_UINT64 }, { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, { "uncached_data", KSTAT_DATA_UINT64 }, { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; +/* + * Asynchronous ARC flush + * + * We track these in a list for arc_async_flush_guid_inuse(). + * Used for both L1 and L2 async teardown. + */ +static list_t arc_async_flush_list; +static kmutex_t arc_async_flush_lock; + +typedef struct arc_async_flush { + uint64_t af_spa_guid; + taskq_ent_t af_tqent; + uint_t af_cache_level; /* 1 or 2 to differentiate node */ + list_node_t af_node; +} arc_async_flush_t; + + /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_hdr_destroy(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); static void arc_prune_async(uint64_t adjust); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_GET_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); #endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); #endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; (void) vbuf; arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { #ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } #ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); #endif return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, - dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t asize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); + ASSERT(dev->l2ad_vdev != NULL); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); + HDR_SET_L2SIZE(hdr, asize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); abd_t *abd = hdr->b_l1hdr.b_pabd; boolean_t free_abd = B_FALSE; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(abd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd = NULL; csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &abd, lsize, MIN(lsize, psize), hdr->b_complevel); if (csize >= lsize || csize > psize) { ret = SET_ERROR(EIO); return (ret); } ASSERT3P(abd, !=, NULL); abd_zero_off(abd, csize, psize - csize); free_abd = B_TRUE; } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret == ENOENT) ret = 0; if (free_abd) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { goto error; } arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (ARC_BUF_SHARED(buf)) { ASSERT(arc_buf_is_shared(buf)); } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (ARC_BUF_SHARED(buf)) { ASSERTF(ARC_BUF_COMPRESSED(buf), "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT(!arc_buf_is_shared(buf)); /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { abd_t dabd; abd_get_from_buf_struct(&dabd, buf->b_data, HDR_GET_LSIZE(hdr)); error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &dabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); abd_free(&dabd); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) return (cnt); if (state == arc_anon) { arc_hdr_destroy(hdr); return (0); } if (state == arc_uncached && !HDR_PREFETCH(hdr)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); return (0); } multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_increment(hdr, state); return (0); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = 0; for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); update_old = (hdr->b_l1hdr.b_buf != NULL || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; update_old = B_FALSE; } update_new = update_old; if (GHOST_STATE(old_state)) update_old = B_TRUE; if (GHOST_STATE(new_state)) update_new = B_TRUE; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have no arc * buffer to use for the reference. As a result, we * use the arc header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; /* * If we have no more encrypted buffers and we've already * gotten a copy of the decrypted data we can free b_rabd * to save some space. */ if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_buf_t *b; for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { if (b != buf && ARC_BUF_ENCRYPTED(b)) break; } if (b == NULL) arc_hdr_free_abd(hdr, B_TRUE); } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { - l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; - l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + uint64_t asize = HDR_GET_L2SIZE(hdr); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; + /* For L2 we expect the header's b_l2size to be valid */ + ASSERT3U(asize, >=, psize); + if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); - vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + if (dev->l2ad_vdev != NULL) { + uint64_t asize = HDR_GET_L2SIZE(hdr); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + } (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); arc_buf_destroy_impl(buf); (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); evicted_state = (state == arc_uncached) ? arc_anon : ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed buffer then we * discard it here before we change states. This ensures that the * accounting is updated correctly in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); if (evicted_state == arc_anon) { arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } static arc_buf_hdr_t * arc_state_alloc_marker(void) { arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_state_impl(). */ marker->b_spa = 0; return (marker); } static void arc_state_free_marker(arc_buf_hdr_t *marker) { kmem_cache_free(hdr_full_cache, marker); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. */ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) markers[i] = arc_state_alloc_marker(); return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified. This * function prevents us from trying to evict more from a state's list * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, type, 0, delta)); } return (0); } /* * Adjust specified fraction, taking into account initial ghost state(s) size, * ghost hit bytes towards increasing the fraction, ghost hit bytes towards * decreasing it, plus a balance factor, controlling the decrease rate, used * to balance metadata vs data. */ static uint64_t arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, uint_t balance) { if (total < 8 || up + down == 0) return (frac); /* * We should not have more ghost hits than ghost size, but they * may get close. Restrict maximum adjustment in that case. */ if (up + down >= total / 4) { uint64_t scale = (up + down) / (total / 8); up /= scale; down /= scale; } /* Get maximal dynamic range by choosing optimal shifts. */ int s = highbit64(total); s = MIN(64 - s, 32); uint64_t ofrac = (1ULL << 32) - frac; if (frac >= 4 * ofrac) up /= frac / (2 * ofrac + 1); up = (up << s) / (total >> (32 - s)); if (ofrac >= 4 * frac) down /= ofrac / (2 * frac + 1); down = (down << s) / (total >> (32 - s)); down = down * 100 / balance; return (frac + up - down); } /* * Calculate (x * multiplier / divisor) without unnecesary overflows. */ static uint64_t arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor) { uint64_t q = (x / divisor); uint64_t r = (x % divisor); return ((q * multiplier) + ((r * multiplier) / divisor)); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t bytes, total_evicted = 0; int64_t e, mrud, mrum, mfud, mfum, w; static uint64_t ogrd, ogrm, ogfd, ogfm; static uint64_t gsrd, gsrm, gsfd, gsfm; uint64_t ngrd, ngrm, ngfd, ngfm; /* Get current size of ARC states we can evict from. */ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); uint64_t d = mrud + mfud; uint64_t m = mrum + mfum; uint64_t t = d + m; /* Get ARC ghost hits since last eviction. */ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t grd = ngrd - ogrd; ogrd = ngrd; ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t grm = ngrm - ogrm; ogrm = ngrm; ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t gfd = ngfd - ogfd; ogfd = ngfd; ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t gfm = ngfm - ogfm; ogfm = ngfm; /* Adjust ARC states balance based on ghost hits. */ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, grm + gfm, grd + gfd, zfs_arc_meta_balance); arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); uint64_t asize = aggsum_value(&arc_sums.arcstat_size); uint64_t ac = arc_c; int64_t wt = t - (asize - ac); /* * Try to reduce pinned dnodes if more than 3/4 of wanted metadata * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); w = wt * (int64_t)(arc_meta >> 16) >> 16; if (nem > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; if (nem < w && w > 4) prune = arc_mf(prune, nem - w * 3 / 4, w / 4); } if (dn > arc_dnode_limit) { prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100); } if (prune > 0) arc_prune_async(prune); /* Evict MRU metadata. */ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; mrum -= bytes; asize -= bytes; /* Evict MFU metadata. */ w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(m - bytes - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; mfum -= bytes; asize -= bytes; /* Evict MRU data. */ wt -= m - total_evicted; w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; mrud -= bytes; asize -= bytes; /* Evict MFU data. */ e = asize - ac; bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); mfud -= bytes; total_evicted += bytes; /* * Evict ghost lists * * Size of each state's ghost list represents how much that state * may grow by shrinking the other states. Would it need to shrink * other states to zero (that is unlikely), its ghost size would be * equal to sum of other three state sizes. But excessive ghost * size may result in false ghost hits (too far back), that may * never result in real cache hits if several states are competing. * So choose some arbitraty point of 1/2 of other state sizes. */ gsrd = (mrum + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - gsrd; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); gsrm = (mrud + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - gsrm; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); gsfd = (mrud + mrum + mfum) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - gsfd; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); gsfm = (mrud + mrum + mfud) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - gsfm; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } -void -arc_flush(spa_t *spa, boolean_t retry) +static void +arc_flush_impl(uint64_t guid, boolean_t retry) { - uint64_t guid = 0; - - /* - * If retry is B_TRUE, a spa must not be specified since we have - * no good way to determine if all of a spa's buffers have been - * evicted from an arc state. - */ - ASSERT(!retry || spa == NULL); - - if (spa != NULL) - guid = spa_load_guid(spa); + ASSERT(!retry || guid == 0); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } +void +arc_flush(spa_t *spa, boolean_t retry) +{ + /* + * If retry is B_TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. + */ + ASSERT(!retry || spa == NULL); + + arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry); +} + +static arc_async_flush_t * +arc_async_flush_add(uint64_t spa_guid, uint_t level) +{ + arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP); + af->af_spa_guid = spa_guid; + af->af_cache_level = level; + taskq_init_ent(&af->af_tqent); + list_link_init(&af->af_node); + + mutex_enter(&arc_async_flush_lock); + list_insert_tail(&arc_async_flush_list, af); + mutex_exit(&arc_async_flush_lock); + + return (af); +} + +static void +arc_async_flush_remove(uint64_t spa_guid, uint_t level) +{ + mutex_enter(&arc_async_flush_lock); + for (arc_async_flush_t *af = list_head(&arc_async_flush_list); + af != NULL; af = list_next(&arc_async_flush_list, af)) { + if (af->af_spa_guid == spa_guid && + af->af_cache_level == level) { + list_remove(&arc_async_flush_list, af); + kmem_free(af, sizeof (*af)); + break; + } + } + mutex_exit(&arc_async_flush_lock); +} + +static void +arc_flush_task(void *arg) +{ + arc_async_flush_t *af = arg; + hrtime_t start_time = gethrtime(); + uint64_t spa_guid = af->af_spa_guid; + + arc_flush_impl(spa_guid, B_FALSE); + arc_async_flush_remove(spa_guid, af->af_cache_level); + + uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); + if (elaspsed > 0) { + zfs_dbgmsg("spa %llu arc flushed in %llu ms", + (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); + } +} + +/* + * ARC buffers use the spa's load guid and can continue to exist after + * the spa_t is gone (exported). The blocks are orphaned since each + * spa import has a different load guid. + * + * It's OK if the spa is re-imported while this asynchronous flush is + * still in progress. The new spa_load_guid will be different. + * + * Also, arc_fini will wait for any arc_flush_task to finish. + */ +void +arc_flush_async(spa_t *spa) +{ + uint64_t spa_guid = spa_load_guid(spa); + arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1); + + taskq_dispatch_ent(arc_flush_taskq, arc_flush_task, + af, TQ_SLEEP, &af->af_tqent); +} + +/* + * Check if a guid is still in-use as part of an async teardown task + */ +boolean_t +arc_async_flush_guid_inuse(uint64_t spa_guid) +{ + mutex_enter(&arc_async_flush_lock); + for (arc_async_flush_t *af = list_head(&arc_async_flush_list); + af != NULL; af = list_next(&arc_async_flush_list, af)) { + if (af->af_spa_guid == spa_guid) { + mutex_exit(&arc_async_flush_lock); + return (B_TRUE); + } + } + mutex_exit(&arc_async_flush_lock); + return (B_FALSE); +} + uint64_t arc_reduce_target_size(uint64_t to_free) { /* * Get the actual arc size. Even if we don't need it, this updates * the aggsum lower bound estimate for arc_is_overflowing(). */ uint64_t asize = aggsum_value(&arc_sums.arcstat_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t c = arc_c; if (c > arc_c_min) { c = MIN(c, MAX(asize, arc_c_min)); to_free = MIN(to_free, c - arc_c_min); arc_c = c - to_free; } else { to_free = 0; } /* * Whether or not we reduced the target size, request eviction if the * current size is over it now, since caller obviously wants some RAM. */ if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } return (to_free); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ if (arc_evict_needed) return (B_TRUE); /* * If we have buffers in uncached state, evict them periodically. */ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && ddi_get_lbolt() - arc_last_uncached_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Always try to evict from uncached state. */ arc_last_uncached_flush = ddi_get_lbolt(); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); /* Evict from other states only if told to. */ if (arc_evict_needed) evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. * * Note we cancel using zthr instead of arc_evict_zthr * because the latter may not yet be initializd when the * callback is first invoked. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { int64_t can_free, free_memory, to_free; (void) arg, (void) zthr; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); can_free = arc_c - arc_c_min; to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(uint64_t bytes) { /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_upper_bound(&arc_sums.arcstat_size) + 2 * SPA_MAXBLOCKSIZE >= arc_c) { uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; } } /* * Check if ARC current size has grown past our upper thresholds. */ static arc_ovf_level_t arc_is_overflowing(boolean_t lax, boolean_t use_reserve) { /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - zfs_max_recordsize; /* Always allow at least one block of overflow. */ if (over < 0) return (ARC_OVF_NONE); /* If we are under memory pressure, report severe overflow. */ if (!lax) return (ARC_OVF_SEVERE); /* We are not under pressure, so be more or less relaxed. */ int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; if (use_reserve) overflow *= 3; return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (alloc_flags & ARC_HDR_ALLOC_LINEAR) return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); else return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. * The lax argument specifies that caller does not have a specific reason * to wait, not aware of any memory pressure. Low memory handlers though * should set it to B_FALSE to wait for all required evictions to complete. * The use_reserve argument allows some callers to wait less than others * to not block critical code paths, possibly blocking other resources. */ void arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve) { switch (arc_is_overflowing(lax, use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size[type], size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. */ static void arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); /* * Update buffer prefetch status. */ boolean_t was_prefetch = HDR_PREFETCH(hdr); boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; if (was_prefetch != now_prefetch) { if (was_prefetch) { ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, prefetch); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); if (was_prefetch) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } else { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (now_prefetch) { if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_prescient_prefetch); } else { ARCSTAT_BUMP(arcstat_predictive_prefetch); } } if (arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { arc_state_t *new_state; /* * This buffer is not in the cache, and does not appear in * our "ghost" lists. Add it to the MRU or uncached state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = now; if (HDR_UNCACHED(hdr)) { new_state = arc_uncached; DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, hdr); } else { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { /* * This buffer has been accessed once recently and either * its read is still in progress or it is in the cache. */ if (HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_arc_access = now; return; } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); /* * If the previous access was a prefetch, then it already * handled possible promotion, so nothing more to do for now. */ if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* * If more than ARC_MINTIME have passed from the previous * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been accessed once recently, but was * evicted from the cache. Would we have bigger MRU, it * would be an MRU hit, so handle it the same way, except * we don't need to check the previous access time. */ hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and either * still in the cache or being restored from one of ghosts. */ if (!HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); } hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { /* * This buffer has been accessed more than once recently, but * has been evicted from the cache. Would we have bigger MFU * it would stay in cache, so move it back to MFU state. */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { /* * This buffer is uncacheable, but we got a hit. Probably * a demand read after prefetch. Nothing more to do here. */ if (!HDR_IO_IN_PROGRESS(hdr)) ARCSTAT_BUMP(arcstat_uncached_hits); hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC and was not accessed * for a long time, so treat it as new and put into MRU. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { /* We need the last one to call below in original order. */ callback_list = acb; if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); if (hash_lock != NULL) mutex_exit(hash_lock); /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_prev; if (acb->acb_wait) { mutex_enter(&acb->acb_wait_lock); acb->acb_wait_error = zio->io_error; acb->acb_wait = B_FALSE; cv_signal(&acb->acb_wait_cv); mutex_exit(&acb->acb_wait_lock); /* acb will be freed by the waiting thread. */ } else { kmem_free(acb, sizeof (arc_callback_t)); } } } /* * Lookup the block at the specified DVA (in bp), and return the manner in * which the block is cached. A zero return indicates not cached. */ int arc_cached(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; uint64_t guid = spa_load_guid(spa); int flags = 0; if (BP_IS_EMBEDDED(bp)) return (ARC_CACHED_EMBEDDED); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return (0); if (HDR_HAS_L1HDR(hdr)) { arc_state_t *state = hdr->b_l1hdr.b_state; /* * We switch to ensure that any future arc_state_type_t * changes are handled. This is just a shift to promote * more compile-time checking. */ switch (state->arcs_state) { case ARC_STATE_ANON: break; case ARC_STATE_MRU: flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1; break; case ARC_STATE_MFU: flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1; break; case ARC_STATE_UNCACHED: /* The header is still in L1, probably not for long */ flags |= ARC_CACHED_IN_L1; break; default: break; } } if (HDR_HAS_L2HDR(hdr)) flags |= ARC_CACHED_IN_L2; mutex_exit(hash_lock); return (flags); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG)) { mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); goto done; } if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto done; } zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the acb_wait_cv whereas nowait * reads register a callback. Both are signalled/called * in arc_read_done. * * Errors of the physical I/O may need to be propagated. * Synchronous read errors are returned here from * arc_read_done via acb_wait_error. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ arc_callback_t *acb = NULL; if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; if (*arc_flags & ARC_FLAG_WAIT) { acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); } acb->acb_zb = *zb; if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_iohits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, iohits); if (*arc_flags & ARC_FLAG_WAIT) { mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } rc = acb->acb_wait_error; mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_TRUE); if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { arc_buf_destroy_impl(buf); buf = NULL; (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, hits); *arc_flags |= ARC_FLAG_CACHED; goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ENOENT); goto done; } /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); goto done; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_GET_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); #ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ arc_callback_t *acb = kmem_zalloc( sizeof (arc_callback_t), KM_SLEEP); acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); acb->acb_zio_head = hdr->b_l1hdr.b_acb->acb_zio_head; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); goto top; } } if (*arc_flags & ARC_FLAG_UNCACHED) { arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); if (!encrypted_read) alloc_flags |= ARC_HDR_ALLOC_LINEAR; } /* * Take additional reference for IO_IN_PROGRESS. It stops * arc_access() from putting this header without any buffers * and so other references but obviously nonevictable onto * the evictable list of MRU or MFU state. */ add_reference(hdr, hdr); if (!embedded_bp) arc_access(hdr, *arc_flags, B_FALSE); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(spa, size, 1, 0); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); done: if (done) done(NULL, zb, bp, buf, private); if (pio && rc != 0) { zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); zio->io_error = rc; zio_nowait(zio); } goto out; } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Helper function for arc_prune_async() it is responsible for safely * handling the execution of a registered arc_prune_func_t. */ static void arc_prune_task(void *ptr) { arc_prune_t *ap = (arc_prune_t *)ptr; arc_prune_func_t *func = ap->p_pfunc; if (func != NULL) func(ap->p_adjust, ap->p_private); (void) zfs_refcount_remove(&ap->p_refcnt, func); } /* * Notify registered consumers they must drop holds on a portion of the ARC * buffers they reference. This provides a mechanism to ensure the ARC can * honor the metadata limit and reclaim otherwise pinned ARC buffers. * * This operation is performed asynchronously so it may be safely called * in the context of the arc_reclaim_thread(). A reference is taken here * for each registered arc_prune_t and the arc_prune_task() is responsible * for releasing it once the registered arc_prune_func_t has completed. */ static void arc_prune_async(uint64_t adjust) { arc_prune_t *ap; mutex_enter(&arc_prune_mtx); for (ap = list_head(&arc_prune_list); ap != NULL; ap = list_next(&arc_prune_list, ap)) { if (zfs_refcount_count(&ap->p_refcnt) >= 2) continue; zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); ap->p_adjust = adjust; if (taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP) == TASKQID_INVALID) { (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); continue; } ARCSTAT_BUMP(arcstat_prune); } mutex_exit(&arc_prune_mtx); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); VERIFY3S(remove_reference(hdr, tag), >, 0); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_GET_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); } callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t) (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { data->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); metadata->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_uncached_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_elements.value.ui64 = as->arcstat_hash_elements_max.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_elements); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_data, &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_data, &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_data, &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_data, &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_data, &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, &as->arcstat_uncached_data, &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); as->arcstat_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); if (arc_dnode_limit > arc_c_max) arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 0 - */ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_elements, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_elements); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; /* * 32-bit fixed point fractions of metadata from total ARC size, * MRU data from all data and MRU metadata from all metadata. */ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + list_create(&arc_async_flush_list, sizeof (arc_async_flush_t), + offsetof(arc_async_flush_t, af_node)); + mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL); + arc_flush_taskq = taskq_create("arc_flush", MIN(boot_ncpus, 4), + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ + /* Wait for any background flushes */ + taskq_wait(arc_flush_taskq); + taskq_destroy(arc_flush_taskq); + /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); + list_destroy(&arc_async_flush_list); + mutex_destroy(&arc_async_flush_lock); + mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } +static boolean_t +l2arc_dev_invalid(const l2arc_dev_t *dev) +{ + /* + * We want to skip devices that are being rebuilt, trimmed, + * removed, or belong to a spa that is being exported. + */ + return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) || + dev->l2ad_rebuild || dev->l2ad_trim_all || + dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting); +} + /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); - } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); + } while (l2arc_dev_invalid(next)); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) + if (l2arc_dev_invalid(next)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); + ASSERT(dev->l2ad_vdev != NULL); + bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); (void) zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock_idx(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { + ASSERT(dev->l2ad_vdev != NULL); + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; + ASSERT(vd != NULL || all); + ASSERT(dev->l2ad_spa != NULL || all); + buflist = &dev->l2ad_buflist; top: rerun = B_FALSE; if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(vd, -asize, 0, 0); + if (vd != NULL) + vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); (void) zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr)) { ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { cabd = abd_alloc_for_io(MAX(size, asize), ismd); uint64_t csize = zio_compress_data(compress, to_write, &cabd, size, MIN(size, psize), hdr->b_complevel); if (csize >= size || csize > psize) { /* * We can't re-compress the block into the original * psize. Even if it fits into asize, it does not * matter, since checksum will never match on read. */ abd_free(cabd); return (SET_ERROR(EIO)); } if (asize > csize) abd_zero_off(cabd, csize, asize - csize); to_write = cabd; } if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *head, *marker; uint64_t write_asize, write_psize, headroom; boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * pass == 0: MFU meta * pass == 1: MRU meta * pass == 2: MFU data * pass == 3: MRU data */ if (l2arc_mfuonly == 1) { if (pass == 1 || pass == 3) continue; } else if (l2arc_mfuonly > 1) { if (pass == 3) continue; } uint64_t passed_sz = 0; headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; /* * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ multilist_sublist_t *mls = l2arc_sublist_lock(pass); ASSERT3P(mls, !=, NULL); if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { skip: /* Skip this buffer rather than waiting. */ if (from_head) hdr = multilist_sublist_next(mls, hdr); else hdr = multilist_sublist_prev(mls, hdr); continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); /* * If the allocated size of this buffer plus the max * size for the pending log block exceeds the evicted * target size, terminate writing buffers for this run. */ if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We should not sleep with sublist lock held or it * may block ARC eviction. Insert a marker to save * the position and drop the lock. */ if (from_head) { multilist_sublist_insert_after(mls, hdr, marker); } else { multilist_sublist_insert_before(mls, hdr, marker); } multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); goto next; } l2arc_free_abd_on_write(to_write, asize, type); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; + /* l2arc_hdr_arcstats_update() expects a valid asize */ + HDR_SET_L2SIZE(hdr, asize); arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | ARC_FLAG_L2_WRITING); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(&dev->l2ad_buflist, head); } list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); boolean_t commit = l2arc_log_blk_insert(dev, hdr); mutex_exit(hash_lock); if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; if (commit) { /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } next: multilist_sublist_lock(mls); if (from_head) hdr = multilist_sublist_next(mls, marker); else hdr = multilist_sublist_prev(mls, marker); multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } arc_state_free_marker(marker); /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; + /* + * After a l2arc_remove_vdev(), the spa_t will no longer be valid + */ + if (spa == NULL) + return; + /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } +typedef struct { + l2arc_dev_t *rva_l2arc_dev; + uint64_t rva_spa_gid; + uint64_t rva_vdev_gid; + boolean_t rva_async; + +} remove_vdev_args_t; + +static void +l2arc_device_teardown(void *arg) +{ + remove_vdev_args_t *rva = arg; + l2arc_dev_t *remdev = rva->rva_l2arc_dev; + hrtime_t start_time = gethrtime(); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); + mutex_destroy(&remdev->l2ad_mtx); + zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); + vmem_free(remdev, sizeof (l2arc_dev_t)); + + uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); + if (elaspsed > 0) { + zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", + (u_longlong_t)rva->rva_spa_gid, + (u_longlong_t)rva->rva_vdev_gid, + (u_longlong_t)elaspsed); + } + + if (rva->rva_async) + arc_async_flush_remove(rva->rva_spa_gid, 2); + kmem_free(rva, sizeof (remove_vdev_args_t)); +} + /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *remdev = NULL; + spa_t *spa = vd->vdev_spa; + boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED || + spa->spa_state == POOL_STATE_DESTROYED; /* * Find the device by vdev */ - remdev = l2arc_vdev_get(vd); + l2arc_dev_t *remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); + /* + * Save info for final teardown + */ + remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t), + KM_SLEEP); + rva->rva_l2arc_dev = remdev; + rva->rva_spa_gid = spa_load_guid(spa); + rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid; + /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); + remdev->l2ad_rebuild_cancel = B_TRUE; if (remdev->l2ad_rebuild_began == B_TRUE) { - remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); + rva->rva_async = asynchronous; /* * Remove device from global list */ + ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC); mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); + + /* During a pool export spa & vdev will no longer be valid */ + if (asynchronous) { + remdev->l2ad_spa = NULL; + remdev->l2ad_vdev = NULL; + } mutex_exit(&l2arc_dev_mtx); - /* - * Clear all buflists and ARC references. L2ARC device flush. - */ - l2arc_evict(remdev, 0, B_TRUE); - list_destroy(&remdev->l2ad_buflist); - ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); - list_destroy(&remdev->l2ad_lbptr_list); - mutex_destroy(&remdev->l2ad_mtx); - zfs_refcount_destroy(&remdev->l2ad_alloc); - zfs_refcount_destroy(&remdev->l2ad_lb_asize); - zfs_refcount_destroy(&remdev->l2ad_lb_count); - kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); - vmem_free(remdev, sizeof (l2arc_dev_t)); + if (!asynchronous) { + l2arc_device_teardown(rva); + return; + } + + arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2); + + taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva, + TQ_SLEEP, &af->af_tqent); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } void l2arc_spa_rebuild_stop(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) continue; mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_cancel = B_TRUE; mutex_exit(&l2arc_rebuild_thr_lock); } for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) continue; mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_began == B_TRUE) { while (dev->l2ad_rebuild == B_TRUE) { cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); - if (!l2arc_rebuild_enabled) { + if (err == ECANCELED) { + /* + * In case the rebuild was canceled do not log to spa history + * log as the pool may be in the process of being removed. + */ + zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + return (err); + } else if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); - } else if (err == ECANCELED) { - /* - * In case the rebuild was canceled do not log to spa history - * log as the pool may be in the process of being removed. - */ - zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", - (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: { abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); abd_t dabd; abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb)); err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, &dabd, asize, sizeof (*this_lb), NULL); abd_free(&dabd); abd_free(abd); if (err != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; } default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); - uint64_t asize; + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, - L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_PSIZE((le)->le_prop), asize, le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); - asize = vdev_psize_to_asize(dev->l2ad_vdev, - L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); + /* l2arc_hdr_arcstats_update() expects a valid asize */ + HDR_SET_L2SIZE(exists, asize); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; abd_t *abd = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer, at least one sector to save */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, &abd, sizeof (*lb), zio_get_compression_max_size(ZIO_COMPRESS_LZ4, dev->l2ad_vdev->vdev_ashift, dev->l2ad_vdev->vdev_ashift, sizeof (*lb)), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ abd_zero_off(abd, psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ abd_fletcher_4_native(abd, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; return (asize); } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 5ae96882935c..39f97d7547c6 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1,1487 +1,1495 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS Write Throttle * ------------------ * * ZFS must limit the rate of incoming writes to the rate at which it is able * to sync data modifications to the backend storage. Throttling by too much * creates an artificial limit; throttling by too little can only be sustained * for short periods and would lead to highly lumpy performance. On a per-pool * basis, ZFS tracks the amount of modified (dirty) data. As operations change * data, the amount of dirty data increases; as ZFS syncs out data, the amount * of dirty data decreases. When the amount of dirty data exceeds a * predetermined threshold further modifications are blocked until the amount * of dirty data decreases (as data is synced out). * * The limit on dirty data is tunable, and should be adjusted according to * both the IO capacity and available memory of the system. The larger the * window, the more ZFS is able to aggregate and amortize metadata (and data) * changes. However, memory is a limited resource, and allowing for more dirty * data comes at the cost of keeping other useful data in memory (for example * ZFS data cached by the ARC). * * Implementation * * As buffers are modified dsl_pool_willuse_space() increments both the per- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of * dirty space used; dsl_pool_dirty_space() decrements those values as data * is synced out from dsl_pool_sync(). While only the poolwide value is * relevant, the per-txg value is useful for debugging. The tunable * zfs_dirty_data_max determines the dirty space limit. Once that value is * exceeded, new writes are halted until space frees up. * * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we * ensure that there is a txg syncing (see the comment in txg.c for a full * description of transaction group stages). * * The IO scheduler uses both the dirty space limit and current amount of * dirty data as inputs. Those values affect the number of concurrent IOs ZFS * issues. See the comment in vdev_queue.c for details of the IO scheduler. * * The delay is also calculated based on the amount of dirty data. See the * comment above dmu_tx_delay() for details. */ /* * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ uint64_t zfs_dirty_data_max = 0; uint64_t zfs_dirty_data_max_max = 0; uint_t zfs_dirty_data_max_percent = 10; uint_t zfs_dirty_data_max_max_percent = 25; /* * The upper limit of TX_WRITE log data. Write operations are throttled * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ static uint_t zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ uint_t zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. * Larger values cause it to delay more for a given amount of dirty data. * Therefore larger values will cause there to be less dirty data for a * given throughput. * * For the smoothest delay, this value should be about 1 billion divided * by the maximum number of operations per second. This will smoothly * handle between 10x and 1/10th this number. * * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. * If the dispatch fails, the call to zil_itxg_clean() will occur * synchronously in the context of spa_sync(), which can negatively * impact the performance of spa_sync() (e.g. in the case of the itxg * list having a large number of itxs that needs to be cleaned). * * Thus, these tunables can be used to manipulate the behavior of the * taskq used by zil_clean(); they determine the number of taskq entries * that are pre-populated when the taskq is first created (via the * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of * taskq entries that are cached after an on-demand allocation (via the * "zfs_zil_clean_taskq_maxalloc"). * * The idea being, we want to try reasonably hard to ensure there will * already be a taskq entry pre-allocated by the time that it is needed * by zil_clean(). This way, we can avoid the possibility of an * on-demand allocation of a new taskq entry from failing, which would * result in zil_itxg_clean() being called synchronously from zil_clean() * (which can adversely affect performance of spa_sync()). * * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. */ static int zfs_zil_clean_taskq_nthr_pct = 100; static int zfs_zil_clean_taskq_minalloc = 1024; static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * dsl_pool_open_impl(spa_t *spa, uint64_t txg) { dsl_pool_t *dp; blkptr_t *bp = spa_get_rootblkptr(spa); dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rrw_init(&dp->dp_config_rwlock, B_TRUE); txg_init(dp, txg); mmp_init(spa); txg_list_create(&dp->dp_dirty_datasets, spa, offsetof(dsl_dataset_t, ds_dirty_link)); txg_list_create(&dp->dp_dirty_zilogs, spa, offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, spa, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, zfs_zil_clean_taskq_minalloc, zfs_zil_clean_taskq_maxalloc, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); aggsum_init(&dp->dp_wrlog_total, 0); for (int i = 0; i < TXG_SIZE; i++) { aggsum_init(&dp->dp_wrlog_pertxg[i], 0); } dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", 100, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); /* * Initialize the caller's dsl_pool_t structure before we actually open * the meta objset. This is done because a self-healing write zio may * be issued as part of dmu_objset_open_impl() and the spa needs its * dsl_pool_t initialized in order to handle the write. */ *dpp = dp; err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &dp->dp_meta_objset); if (err != 0) { dsl_pool_close(dp); *dpp = NULL; } return (err); } int dsl_pool_open(dsl_pool_t *dp) { int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; err = dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } dsl_dir_rele(dd, dp); if (err) goto out; } if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err == 0) { VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); } else if (err == ENOENT) { /* * We might not have created the remap bpobj yet. */ } else { goto out; } } /* * Note: errors ignored, because the these special dirs, used for * space accounting, are only created on demand. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj); if (err != 0) goto out; } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, &dp->dp_empty_bpobj); if (err != 0) goto out; } err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); if (err == ENOENT) err = 0; if (err) goto out; err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* * Drop our references from dsl_pool_open(). * * Since we held the origin_snap from "syncing" context (which * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. */ if (dp->dp_origin_snap != NULL) dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir != NULL) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir != NULL) dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_leak_dir != NULL) dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir != NULL) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); bpobj_close(&dp->dp_obsolete_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset != NULL) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_early_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); spa_sync_tq_destroy(dp->dp_spa); - /* - * We can't set retry to TRUE since we're explicitly specifying - * a spa to flush. This is good enough; any missed buffers for - * this spa won't cause trouble, and they'll eventually fall - * out of the ARC just like any other unused buffer. - */ - arc_flush(dp->dp_spa, FALSE); + if (dp->dp_spa->spa_state == POOL_STATE_EXPORTED || + dp->dp_spa->spa_state == POOL_STATE_DESTROYED) { + /* + * On export/destroy perform the ARC flush asynchronously. + */ + arc_flush_async(dp->dp_spa); + } else { + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + } mmp_fini(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); dmu_buf_user_evict_wait(); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); ASSERT0(aggsum_value(&dp->dp_wrlog_total)); aggsum_fini(&dp->dp_wrlog_total); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); aggsum_fini(&dp->dp_wrlog_pertxg[i]); } taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; /* * Currently, we only create the obsolete_bpobj where there are * indirect vdevs with referenced mappings. */ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); /* create and open the obsolete_bpobj */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, tx)); bpobj_free(dp->dp_meta_objset, dp->dp_obsolete_bpobj.bpo_object, tx); bpobj_close(&dp->dp_obsolete_bpobj); } dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), dsl_crypto_params_t *dcp, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); #ifdef _KERNEL objset_t *os; #else objset_t *os __attribute__((unused)); #endif dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); spa->spa_meta_objset = dp->dp_meta_objset; /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT0(err); /* Initialize scan structures */ VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* * Some features may be needed when creating the root dataset, so we * create the feature objects here. */ if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT) spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx); /* create the root dataset */ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx); /* create the root objset */ VERIFY0(dsl_dataset_hold_obj_flags(dp, obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); #ifdef _KERNEL zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dmu_tx_commit(tx); rrw_exit(&dp->dp_config_rwlock, FTAG); return (dp); } /* * Account for the meta-objset space in its placeholder dsl_dir. */ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp) { ASSERT3U(comp, ==, uncomp); /* it's all metadata */ mutex_enter(&dp->dp_lock); dp->dp_mos_used_delta += used; dp->dp_mos_compressed_delta += comp; dp->dp_mos_uncompressed_delta += uncomp; mutex_exit(&dp->dp_lock); } static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); dmu_objset_sync_done(dp->dp_meta_objset, tx); taskq_wait(dp->dp_sync_taskq); multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } static void dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) { ASSERT(MUTEX_HELD(&dp->dp_lock)); if (delta < 0) ASSERT3U(-delta, <=, dp->dp_dirty_total); dp->dp_dirty_total += delta; /* * Note: we signal even when increasing dp_dirty_total. * This ensures forward progress -- each thread wakes the next waiter. */ if (dp->dp_dirty_total < zfs_dirty_data_max) cv_signal(&dp->dp_spaceavail_cv); } void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) { ASSERT3S(size, >=, 0); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); aggsum_add(&dp->dp_wrlog_total, size); /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) { int64_t delta; delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); /* Compact per-CPU sums after the big change. */ (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) { spa_t *spa = dp->dp_spa; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; txg_list_t *tl = &vd->vdev_ms_list; metaslab_t *ms; for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { VERIFY(range_tree_is_empty(ms->ms_freeing)); VERIFY(range_tree_is_empty(ms->ms_checkpointing)); } } return (B_TRUE); } #else #define dsl_early_sync_task_verify(dp, txg) \ ((void) sizeof (dp), (void) sizeof (txg), B_TRUE) #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); tx = dmu_tx_create_assigned(dp, txg); /* * Run all early sync tasks before writing out any dirty blocks. * For more info on early sync tasks see block comment in * dsl_early_sync_task(). */ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { dsl_sync_task_t *dst; ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { ASSERT(dsl_early_sync_task_verify(dp, txg)); dsl_sync_task_sync(dst, tx); } ASSERT(dsl_early_sync_task_verify(dp, txg)); } /* * Write out all dirty blocks of dirty datasets. Note, this could * create a very large (+10k) zio tree. */ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, rio, tx); } VERIFY0(zio_wait(rio)); /* * Update the long range free counter after * we're done syncing user data */ mutex_enter(&dp->dp_lock); ASSERT(spa_sync_pass(dp->dp_spa) == 1 || dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; mutex_exit(&dp->dp_lock); /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group/project space accounting. This happens * in tasks dispatched to dp_sync_taskq, so wait for them before * continuing. */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the * sync tasks, so that any snapshots will have the correct * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to * dsl_dataset_dirty() from the userquota accounting * code paths. */ if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist and livelists * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { objset_t *os = ds->ds_objset; if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } dsl_dataset_sync_done(ds, tx); dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); } /* * The MOS's space is accounted for in the pool/$MOS * (dp_mos_dir). We can't modify the mos while we're syncing * it, so we remember the deltas and apply them here. */ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || dp->dp_mos_uncompressed_delta != 0) { dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, dp->dp_mos_used_delta, dp->dp_mos_compressed_delta, dp->dp_mos_uncompressed_delta, tx); dp->dp_mos_used_delta = 0; dp->dp_mos_compressed_delta = 0; dp->dp_mos_uncompressed_delta = 0; } if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } /* * We have written all of the accounted dirty data, so our * dp_space_towrite should now be zero. However, some seldom-used * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up * the accounting of any dirtied space now. * * Note that, besides any dirty data from datasets, the amount of * dirty data in the MOS is also accounted by the pool. Therefore, * we want to do this cleanup after dsl_pool_sync_mos() so we don't * attempt to update the accounting for the same dirty data twice. * (i.e. at this point we only update the accounting for the space * that we know that we "leaked"). */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. * dsl_dir_destroy_check() will fail if there are unexpected holds. * Therefore, we want to sync the MOS (thus syncing the dd_dbuf * and clearing the hold on it) before we process the sync_tasks. * The MOS data dirtied by the sync_tasks will be synced on the next * pass. */ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. */ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { zilog_t *zilog; while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * We don't remove the zilog from the dp_dirty_zilogs * list until after we've cleaned it. This ensures that * callers of zilog_is_dirty() receive an accurate * answer when they are racing with the spa sync thread. */ zil_clean(zilog, txg); (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } dsl_pool_wrlog_clear(dp, txg); ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* * TRUE if the current thread is the tx_sync_thread or if we * are being called from SPA context during pool initialization. */ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || spa_is_initializing(dp->dp_spa) || taskq_member(dp->dp_sync_taskq, curthread)); } /* * This function returns the amount of allocatable space in the pool * minus whatever space is currently reserved by ZFS for specific * purposes. Specifically: * * 1] Any reserved SLOP space * 2] Any space used by the checkpoint * 3] Any space used for deferred frees * * The latter 2 are especially important because they are needed to * rectify the SPA's and DMU's different understanding of how much space * is used. Now the DMU is aware of that extra space tracked by the SPA * without having to maintain a separate special dir (e.g similar to * $MOS, $FREEING, and $LEAKED). * * Note: By deferred frees here, we mean the frees that were deferred * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the * segments placed in ms_defer trees during metaslab_sync_done(). */ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) { spa_t *spa = dp->dp_spa; uint64_t space, resv, adjustedsize; uint64_t spa_deferred_frees = spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; space = spa_get_dspace(spa) - spa_get_checkpoint_space(spa) - spa_deferred_frees; resv = spa_get_slop_space(spa); switch (slop_policy) { case ZFS_SPACE_CHECK_NORMAL: break; case ZFS_SPACE_CHECK_RESERVED: resv >>= 1; break; case ZFS_SPACE_CHECK_EXTRA_RESERVED: resv >>= 2; break; case ZFS_SPACE_CHECK_NONE: resv = 0; break; default: panic("invalid slop policy value: %d", slop_policy); break; } adjustedsize = (space >= resv) ? (space - resv) : 0; return (adjustedsize); } uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) { uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); uint64_t deferred = metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; return (quota); } uint64_t dsl_pool_deferred_space(dsl_pool_t *dp) { return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa))); } boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; /* * We are not taking the dp_lock here and few other places, since torn * reads are unlikely: on 64-bit systems due to register size and on * 32-bit due to memory constraints. Pool-wide locks in hot path may * be too expensive, while we do not need a precise result here. */ return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; return (dirty > dirty_min_bytes); } void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { if (space > 0) { mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); boolean_t needsync = !dmu_tx_is_syncing(tx) && dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); if (needsync) txg_kick(dp, tx->tx_txg); } } void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) { ASSERT3S(space, >=, 0); if (space == 0) return; mutex_enter(&dp->dp_lock); if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { /* XXX writing something we didn't dirty? */ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; prev = NULL; } if (prev == NULL) { prev = dp->dp_origin_snap; /* * The $ORIGIN can't have any data, or the accounting * will be wrong. */ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { dsl_dataset_rele(ds, FTAG); return (0); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; dsl_dataset_phys(ds)->ds_prev_snap_txg = dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_num_children++; if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); } } ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) dsl_dataset_rele(prev, FTAG); return (0); } void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; objset_t *mos = dp->dp_meta_objset; if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { dsl_dataset_t *origin; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(origin->ds_dir)->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } return (0); } void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* * We can't use bpobj_alloc(), because spa_version() still * returns the old version, and we need a new-version bpobj with * subobj support. So call dmu_object_alloc() directly. */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t dsobj; dsl_dataset_t *ds; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); } taskq_t * dsl_pool_zrele_taskq(dsl_pool_t *dp) { return (dp->dp_zrele_taskq); } taskq_t * dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp) { return (dp->dp_unlinked_drain_taskq); } /* * Walk through the pool-wide zap object of temporary snapshot user holds * and release them. */ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t *za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); holds = fnvlist_alloc(); za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { char *htag; nvlist_t *tags; htag = strchr(za->za_name, '-'); *htag = '\0'; ++htag; if (nvlist_lookup_nvlist(holds, za->za_name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(holds, za->za_name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } dsl_dataset_user_release_tmp(dp, holds); fnvlist_free(holds); zap_cursor_fini(&zc); zap_attribute_free(za); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ static void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. */ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); kmem_strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } /* * Release a temporary hold for the given dataset object and tag. */ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } /* * DSL Pool Configuration Lock * * The dp_config_rwlock protects against changes to DSL state (e.g. dataset * creation / destruction / rename / property setting). It must be held for * read to hold a dataset or dsl_dir. I.e. you must call * dsl_pool_config_enter() or dsl_pool_hold() before calling * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock * must be held continuously until all datasets and dsl_dirs are released. * * The only exception to this rule is that if a "long hold" is placed on * a dataset, then the dp_config_rwlock may be dropped while the dataset * is still held. The long hold will prevent the dataset from being * destroyed -- the destroy will fail with EBUSY. A long hold can be * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset * (by calling dsl_{dataset,objset}_{try}own{_obj}). * * Legitimate long-holders (including owners) should be long-running, cancelable * tasks that should cause "zfs destroy" to fail. This includes DMU * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), * "zfs send", and "zfs diff". There are several other long-holders whose * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). * * The usual formula for long-holding would be: * dsl_pool_hold() * dsl_dataset_hold() * ... perform checks ... * dsl_dataset_long_hold() * dsl_pool_rele() * ... perform long-running task ... * dsl_dataset_long_rele() * dsl_dataset_rele() * * Note that when the long hold is released, the dataset is still held but * the pool is not held. The dataset may change arbitrarily during this time * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * * Operations generally fall somewhere into the following taxonomy: * * Read-Only Modifying * * Dataset Layer / MOS zfs get zfs destroy * * Individual Dataset read() write() * * * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the * dp_config_rwlock. See the comment above dsl_sync_task() for details. * * Read-only operations will manually hold the pool, then the dataset, obtain * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. * * * Operations On Individual Datasets * * Objects _within_ an objset should only be modified by the current 'owner' * of the objset to prevent incorrect concurrent modification. Thus, use * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, * and fail with EBUSY if there is already an owner. The owner can then * implement its own locking strategy, independent of the dataset layer's * locking infrastructure. * (E.g., the ZPL has its own set of locks to control concurrency. A regular * vnop will not reach into the dataset layer). * * Ideally, objects would also only be read by the objset’s owner, so that we * don’t observe state mid-modification. * (E.g. the ZPL is creating a new object and linking it into a directory; if * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an * intermediate state. The ioctl level violates this but in pretty benign * ways, e.g. reading the zpl props object.) */ int dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp) { spa_t *spa; int error; error = spa_open(name, &spa, tag); if (error == 0) { *dp = spa_get_dsl(spa); dsl_pool_config_enter(*dp, tag); } return (error); } void dsl_pool_rele(dsl_pool_t *dp, const void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void dsl_pool_config_enter(dsl_pool_t *dp, const void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. * * The rrwlock can (with the track_all flag) track all reading threads, * which is very useful for debugging which code path failed to release * the lock, and for verifying that the *current* thread does hold * the lock. * * (Unlike a rwlock, which knows that N threads hold it for * read, but not *which* threads, so rw_held(RW_READER) returns TRUE * if any thread holds it for read, even if this thread doesn't). */ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); } void dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void dsl_pool_config_exit(dsl_pool_t *dp, const void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } boolean_t dsl_pool_config_held(dsl_pool_t *dp) { return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp) { return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); } EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD, "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, "Number of taskq entries that are pre-populated"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, "Max number of taskq entries that are cached"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index d228315ade36..a2b887962270 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,3151 +1,3179 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, spa, and indirect_remap is on * by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; /* * Number of allocators to use, per spa instance */ static int spa_num_allocators = 4; static int spa_cpus_per_allocator = 4; /* * Spa active allocator. * Valid values are zfs_active_allocator=. */ const char *zfs_active_allocator = "dynamic"; void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* * By default dedup and user data indirects land in the special class */ static int zfs_ddt_data_is_special = B_TRUE; static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); scl->scl_writer = NULL; scl->scl_write_wanted = 0; scl->scl_count = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); ASSERT(scl->scl_count == 0); } } int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, int mmp_flag) { (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 0); } /* * The spa_config_enter_mmp() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is * a single disk in a pool that is responding slowly and presumably about to * fail. */ void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(scl->scl_count > 0); if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); if (spa == NULL) return (NULL); /* * Avoid racing with import/export, which don't hold the namespace * lock for their entire duration. */ if ((spa->spa_load_thread != NULL && spa->spa_load_thread != curthread) || (spa->spa_export_thread != NULL && spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. */ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); /* Do not allow more allocators than fraction of CPUs. */ spa->spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } if (spa->spa_alloc_count > 1) { spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count]), KM_SLEEP); mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, NULL); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_allocs[i].spaa_tree); mutex_destroy(&spa->spa_allocs[i].spaa_lock); } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); if (spa->spa_alloc_count > 1) { mutex_destroy(&spa->spa_allocs_use->sau_lock); kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count])); } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread || spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held or be the spa export thread. We really * compare against spa_minref, which is the number of references * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. */ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); memcpy(new, s, len + 1); return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid; if (spa != NULL) { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); } +static boolean_t +spa_load_guid_exists(uint64_t guid) +{ + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa_load_guid(spa) == guid) + return (B_TRUE); + } + + return (arc_async_flush_guid_inuse(guid)); +} + +uint64_t +spa_generate_load_guid(void) +{ + uint64_t guid; + + do { + (void) random_get_pseudo_bytes((void *)&guid, + sizeof (guid)); + } while (guid == 0 || spa_load_guid_exists(guid)); + + return (guid); +} + void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; const char *checksum = NULL; const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strlcpy(buf, spa->spa_root, buflen); } uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. */ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = 0; uint64_t slop = 0; /* * Make sure spa_dedup_dspace has been set. */ if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that * contribute to dspace. If a file is overwritten, its old * blocks are freed and new blocks are allocated. If there are * no snapshots of the file, the available space should remain * the same. The old blocks could be freed from the * non-allocating vdev, but the new blocks must be allocated on * other (allocating) vdevs. By reserving the entire size of * the non-allocating vdevs (including allocated space), we * ensure that there will be enough space on the allocating * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); spa->spa_rdspace -= spa->spa_nonallocating_dspace; } spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } boolean_t spa_special_has_ddt(spa_t *spa) { return (zfs_ddt_data_is_special && spa->spa_special_class->mc_groups != 0); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, const zio_t *zio) { const zio_prop_t *zp = &zio->io_prop; /* * Override object type for the purposes of selecting a storage class. * Primarily for DMU_OTN_ types where we can't explicitly control their * storage class; instead, choose a static type most closely matches * what we want. */ dmu_object_type_t objtype = zp->zp_storage_type == DMU_OT_NONE ? zp->zp_type : zp->zp_storage_type; /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; if (DMU_OT_IS_DDT(objtype)) { if (spa->spa_dedup_class->mc_groups != 0) return (spa_dedup_class(spa)); else if (has_special_class && zfs_ddt_data_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (zp->zp_level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) { if (has_special_class) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file blocks in special class in some cases (like * for the dRAID vdev feature). But always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata. */ if (DMU_OT_IS_FILE(objtype) && has_special_class && zio->io_size <= zp->zp_zpl_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name", "notes"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? sip->pool_name : "-"), (sip->spa_load_notes ? sip->spa_load_notes : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } static void spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, const char *fmt, va_list adx) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; uint64_t pool_guid = spa_guid(spa); if (shl->size == 0) return; char *notes = kmem_vasprintf(fmt, adx); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } sip->spa_load_notes = notes; if (log_dbgmsg) zfs_dbgmsg("'%s' %s", sip->pool_name, notes); notes = NULL; break; } } mutex_exit(&shl->procfs_list.pl_lock); if (notes != NULL) kmem_strfree(notes); } void spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); va_end(adx); } void spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); va_end(adx); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. */ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); zap_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); zap_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_get_last_scrubbed_txg(spa_t *spa) { return (spa->spa_scrubbed_last_txg); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_errorscrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; // error scrub stats spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* error scrub data stored on disk */ ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; ps->pss_error_scrub_to_be_examined = scn->errorscrub_phys.dep_to_examine; /* error scrub data not stored on disk */ ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). */ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * it is possible for the spa to exist, without root vdev * as the spa transitions during import/export */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, "Minimum number of CPUs per allocators"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 983f444d79b0..1a930f1e4f1a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6585 +1,6585 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; - spa->spa_load_guid = spa_generate_guid(NULL); + spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_dio_verify_rl, &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. */ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); /* * If this probe was initiated from zio pipeline, then * change the state in a spa_async_request. Probes that * were initiated from a vdev_open can change the state * as part of the open call. */ if (vps->vps_zio_done_probe) { vd->vdev_fault_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); } } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, &vd->vdev_slow_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */