diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 5fe3a2006f8c..4d57e52e8468 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -1,152 +1,153 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. 
*/ #ifndef _SYS_METASLAB_H #define _SYS_METASLAB_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct metaslab_ops { const char *msop_name; uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; extern const metaslab_ops_t zfs_metaslab_ops; int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t, metaslab_t **); void metaslab_fini(metaslab_t *); void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t); void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *); void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *); boolean_t metaslab_unflushed_dirty(metaslab_t *); uint64_t metaslab_unflushed_txg(metaslab_t *); uint64_t metaslab_estimated_condensed_size(metaslab_t *); int metaslab_sort_by_flushed(const void *, const void *); void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t); uint64_t metaslab_unflushed_changes_memused(metaslab_t *); int metaslab_load(metaslab_t *); void metaslab_unload(metaslab_t *); boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *); uint64_t metaslab_allocated_space(metaslab_t *); void metaslab_sync(metaslab_t *, uint64_t); void metaslab_sync_done(metaslab_t *, uint64_t); void metaslab_sync_reassess(metaslab_group_t *); uint64_t metaslab_largest_allocatable(metaslab_t *); /* * metaslab alloc flags */ #define METASLAB_ZIL 0x1 #define METASLAB_GANG_HEADER 0x2 #define METASLAB_GANG_CHILD 0x4 #define METASLAB_ASYNC_ALLOC 0x8 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, const blkptr_t *, int, zio_alloc_list_t *, int, const void *); int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t, blkptr_t *, int, uint64_t, const blkptr_t *, int, zio_alloc_list_t *, int, const void *, uint64_t *); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, dva_t *, int, const dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const 
blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *); void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); -metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *, - boolean_t); +metaslab_class_t *metaslab_class_create(spa_t *, const char *, + const metaslab_ops_t *, boolean_t); void metaslab_class_destroy(metaslab_class_t *); void metaslab_class_validate(metaslab_class_t *); void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *, boolean_t, boolean_t *); boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); +const char *metaslab_class_get_name(metaslab_class_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); void metaslab_space_update(vdev_t *, metaslab_class_t *, int64_t, int64_t, int64_t); metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); void metaslab_group_destroy(metaslab_group_t *); void 
metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); boolean_t metaslab_group_initialized(metaslab_group_t *); uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int, uint64_t, const void *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); void metaslab_disable(metaslab_t *); void metaslab_enable(metaslab_t *, boolean_t, boolean_t); void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift); #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_H */ diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 501ccf3cb6cd..83fbe620fe37 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -1,545 +1,546 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2011, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Metaslab allocation tracing record. */ typedef struct metaslab_alloc_trace { list_node_t mat_list_node; metaslab_group_t *mat_mg; metaslab_t *mat_msp; uint64_t mat_size; uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; int mat_allocator; } metaslab_alloc_trace_t; /* * Used by the metaslab allocation tracing facility to indicate * error conditions. These errors are stored to the offset member * of the metaslab_alloc_trace_t record and displayed by mdb. */ typedef enum trace_alloc_type { TRACE_ALLOC_FAILURE = -1ULL, TRACE_TOO_SMALL = -2ULL, TRACE_FORCE_GANG = -3ULL, TRACE_NOT_ALLOCATABLE = -4ULL, TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, TRACE_VDEV_ERROR = -8ULL, TRACE_DISABLED = -9ULL, } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_WEIGHT_CLAIM (1ULL << 61) #define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a * metaslab, such that the "best" metaslab appears first when sorting the * metaslabs by weight. 
The weight (and therefore the "best" metaslab) can * be determined in two different ways: by computing a weighted sum of all * the free space in the metaslab (a space based weight) or by counting only * the free segments of the largest size (a segment based weight). We prefer * the segment based weight because it reflects how the free space is * comprised, but we cannot always use it -- legacy pools do not have the * space map histogram information necessary to determine the largest * contiguous regions. Pools that have the space map histogram determine * the segment weight by looking at each bucket in the histogram and * determining the free space whose size in bytes is in the range: * [2^i, 2^(i+1)) * We then encode the largest index, i, that contains regions into the * segment-weighted value. * * Space-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ ((weight) == 0 || BF64_GET((weight), 60, 1)) #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. 
*/ #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * Per-allocator data structure. */ typedef struct metaslab_class_allocator { kmutex_t mca_lock; avl_tree_t mca_tree; metaslab_group_t *mca_rotor; uint64_t mca_aliquot; /* * The allocation throttle works on a reservation system. Whenever * an asynchronous zio wants to perform an allocation it must * first reserve the number of bytes that it wants to allocate. * If there aren't sufficient slots available for the pending zio * then that I/O is throttled until more slots free up. The current * size of reserved allocations is maintained by mca_reserved. * The maximum total size of reserved allocations is determined by * mc_alloc_max in the metaslab_class_t. Gang blocks are allowed * to reserve for their headers even if we've reached the maximum. */ uint64_t mca_reserved; } ____cacheline_aligned metaslab_class_allocator_t; /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines * the allocatable region for that vdev. Examples of these categories include * "normal" for data block allocations (i.e. main pool allocations) or "log" * for allocations designated for intent log devices (i.e. slog devices). * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. Allocations are done * by traversing the metaslab groups that are linked off of the mca_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. 
The metaslab * class defines the low-level block allocator that will be used as the * final step in allocation. These allocators are pluggable allowing each class * to use a block allocator that best suits that class. */ struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; - const metaslab_ops_t *mc_ops; + const char *mc_name; + const metaslab_ops_t *mc_ops; /* * Track the number of metaslab groups that have been initialized * and can accept allocations. An initialized metaslab group is * one has been completely added to the config (i.e. we have * updated the MOS config and the space has been added to the pool). */ uint64_t mc_groups; boolean_t mc_is_log; boolean_t mc_alloc_throttle_enabled; uint64_t mc_alloc_io_size; uint64_t mc_alloc_max; uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; /* * List of all loaded metaslabs in the class, sorted in order of most * recent use. */ multilist_t mc_metaslab_txg_list; metaslab_class_allocator_t mc_allocator[]; }; /* * Per-allocator data structure. */ typedef struct metaslab_group_allocator { zfs_refcount_t mga_queue_depth; metaslab_t *mga_primary; metaslab_t *mga_secondary; } ____cacheline_aligned metaslab_group_allocator_t; /* * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) * of a top-level vdev. They are linked together to form a circular linked * list and can belong to only one metaslab class. Metaslab groups may become * ineligible for allocations for a number of reasons such as limited free * space, fragmentation, or going offline. When this happens the allocator will * simply find the next metaslab group in the linked list and attempt * to allocate from that group instead. 
*/ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; uint64_t mg_queue_target; boolean_t mg_allocatable; /* can we allocate? */ uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after * we have updated the MOS config and added the space to the pool. * We only allow allocation attempts to a metaslab group if it * has been initialized. */ boolean_t mg_initialized; int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out * of space then its share of work must be distributed to other * groups. */ boolean_t mg_no_free_space; uint64_t mg_fragmentation; uint64_t mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_disabled; boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; metaslab_group_allocator_t mg_allocator[]; }; /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 /* * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context * where it is safe to update the on-disk space maps. An additional set * of in-core trees is maintained to track deferred frees * (ms_defer). Once a block is freed it will move from the * ms_freed to the ms_defer tree. 
A deferred free means that a block * has been freed but cannot be used by the pool until TXG_DEFER_SIZE * transaction groups later. For example, a block that is freed in txg * 50 will not be available for reallocation until txg 52 (50 + * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. * A pool could be safely rolled back TXG_DEFER_SIZE transaction * groups and ensure that no block has been reallocated. * * The simplified transition diagram looks like this: * * * ALLOCATE * | * V * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) * ^ * | ms_freeing <--- FREE * | | * | v * | ms_freed * | | * +-------- ms_defer[2] <-------+-------> (write to space map) * * * Each metaslab's space is tracked in a single space map in the MOS, * which is only updated in syncing context. Each time we sync a txg, * we append the allocs and frees from that txg to the space map. The * pool space is only updated once all metaslabs have finished syncing. * * To load the in-core free tree we read the space map from disk. This * object contains a series of alloc and free records that are combined * to make up the list of all free segments in this metaslab. These * segments are represented in-core by the ms_allocatable and are stored * in an AVL tree. * * As the space map grows (as a result of the appends) it will * eventually become space-inefficient. When the metaslab's in-core * free tree is zfs_condense_pct/100 times the size of the minimal * on-disk representation, we rewrite it in its minimized form. If a * metaslab needs to condense then we must set the ms_condensing flag to * ensure that allocations are not performed on the metaslab that is * being written. */ struct metaslab { /* * This is the main lock of the metaslab and its purpose is to * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. 
* * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram * aggregation, or marking the metaslab for initialization. */ kmutex_t ms_lock; /* * Acquired together with the ms_lock whenever we expect to * write to metaslab data on-disk (i.e flushing entries to * the metaslab's space map). It helps coordinate readers of * the metaslab's space map [see spa_vdev_remove_thread()] * with writers [see metaslab_sync() or metaslab_flush()]. * * Note that metaslab_load(), even though a reader, uses * a completely different mechanism to deal with the reading * of the metaslab's space map based on ms_synced_length. That * said, the function still uses the ms_sync_lock after it * has read the ms_sm [see relevant comment in metaslab_load() * as to why]. */ kmutex_t ms_sync_lock; kcondvar_t ms_load_cv; space_map_t *ms_sm; uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; uint64_t ms_fragmentation; zfs_range_tree_t *ms_allocating[TXG_SIZE]; zfs_range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; uint64_t ms_allocating_total; /* * The following range trees are accessed only from syncing context. * ms_free*tree only have entries while syncing, and are empty * between syncs. */ zfs_range_tree_t *ms_freeing; /* to free this syncing txg */ /* already freed this syncing txg */ zfs_range_tree_t *ms_freed; zfs_range_tree_t *ms_defer[TXG_DEFER_SIZE]; /* to add to the checkpoint */ zfs_range_tree_t *ms_checkpointing; /* * The ms_trim tree is the set of allocatable segments which are * eligible for trimming. (When the metaslab is loaded, it's a * subset of ms_allocatable.) It's kept in-core as long as the * autotrim property is set and is not vacated when the metaslab * is unloaded. Its purpose is to aggregate freed ranges to * facilitate efficient trimming. */ zfs_range_tree_t *ms_trim; boolean_t ms_condensing; /* condensing? 
*/ boolean_t ms_condense_wanted; /* * The number of consumers which have disabled the metaslab. */ uint64_t ms_disabled; /* * We must always hold the ms_lock when modifying ms_loaded * and ms_loading. */ boolean_t ms_loaded; boolean_t ms_loading; kcondvar_t ms_flush_cv; boolean_t ms_flushing; /* * The following histograms count entries that are in the * metaslab's space map (and its histogram) but are not in * ms_allocatable yet, because they are in ms_freed, ms_freeing, * or ms_defer[]. * * When the metaslab is not loaded, its ms_weight needs to * reflect what is allocatable (i.e. what will be part of * ms_allocatable if it is loaded). The weight is computed from * the spacemap histogram, but that includes ranges that are * not yet allocatable (because they are in ms_freed, * ms_freeing, or ms_defer[]). Therefore, when calculating the * weight, we need to remove those ranges. * * The ranges in the ms_freed and ms_defer[] range trees are all * present in the spacemap. However, the spacemap may have * multiple entries to represent a contiguous range, because it * is written across multiple sync passes, but the changes of * all sync passes are consolidated into the range trees. * Adjacent ranges that are freed in different sync passes of * one txg will be represented separately (as 2 or more entries) * in the space map (and its histogram), but these adjacent * ranges will be consolidated (represented as one entry) in the * ms_freed/ms_defer[] range trees (and their histograms). * * When calculating the weight, we can not simply subtract the * range trees' histograms from the spacemap's histogram, * because the range trees' histograms may have entries in * higher buckets than the spacemap, due to consolidation. * Instead we must subtract the exact entries that were added to * the spacemap's histogram. ms_synchist and ms_deferhist[] * represent these exact entries, so we can subtract them from * the spacemap's histogram when calculating ms_weight. 
* * ms_synchist represents the same ranges as ms_freeing + * ms_freed, but without consolidation across sync passes. * * ms_deferhist[i] represents the same ranges as ms_defer[i], * but without consolidation across sync passes. */ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; /* * Tracks the exact amount of allocated space of this metaslab * (and specifically the metaslab's space map) up to the most * recently completed sync pass [see usage in metaslab_sync()]. */ uint64_t ms_allocated_space; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ uint64_t ms_activation_weight; /* activation weight */ /* * Track of whenever a metaslab is selected for loading or allocation. * We use this value to determine how long the metaslab should * stay cached. */ uint64_t ms_selected_txg; /* * ms_load/unload_time can be used for performance monitoring * (e.g. by dtrace or mdb). */ hrtime_t ms_load_time; /* time last loaded */ hrtime_t ms_unload_time; /* time last unloaded */ uint64_t ms_selected_time; /* time last allocated from (secs) */ uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* * -1 if it's not active in an allocator, otherwise set to the allocator * this metaslab is active for. */ int ms_allocator; boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ /* * The metaslab block allocators can optionally use a size-ordered * range tree and/or an array of LBAs. Not all allocators use * this functionality. The ms_allocatable_by_size should always * contain the same number of segments as the ms_allocatable. The * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. 
*/ zfs_btree_t ms_allocatable_by_size; zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ /* * Node in metaslab class's selected txg list */ multilist_node_t ms_class_txg_node; /* * Allocs and frees that are committed to the vdev log spacemap but * not yet to this metaslab's spacemap. */ zfs_range_tree_t *ms_unflushed_allocs; zfs_range_tree_t *ms_unflushed_frees; /* * We have flushed entries up to but not including this TXG. In * other words, all changes from this TXG and onward should not * be in this metaslab's space map and must be read from the * log space maps. */ uint64_t ms_unflushed_txg; boolean_t ms_unflushed_dirty; /* updated every time we are done syncing the metaslab's space map */ uint64_t ms_synced_length; boolean_t ms_new; }; typedef struct metaslab_unflushed_phys { /* on-disk counterpart of ms_unflushed_txg */ uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; #ifdef __cplusplus } #endif #endif /* _SYS_METASLAB_IMPL_H */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 42c43216392c..43b94eba2d58 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1,6395 +1,6405 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab group's per child vdev granularity, in bytes. This is roughly * similar to what would be referred to as the "stripe size" in traditional * RAID arrays. In normal operation, we will try to write this amount of * data to each disk before moving on to the next top-level vdev. */ static uint64_t metaslab_aliquot = 2 * 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * Of blocks of size >= metaslab_force_ganging, actually gang them this often. */ uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. 
*/ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksz), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. 
*/ static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmentation metric (measured as a percentage) is less than or * equal to zfs_mg_fragmentation_threshold. If a metaslab group * exceeds this threshold then it will be skipped unless all metaslab * groups within the metaslab class have also crossed this threshold. * * This tunable was introduced to avoid edge cases where we continue * allocating from very fragmented disks in our pool while other, less * fragmented disks, exist. On the other hand, if all disks in the * pool are uniformly approaching the threshold, the threshold can * be a speed bump in performance, where we keep switching the disks * that we allocate from (e.g. we allocate some segments from disk A * making it bypass the threshold while freeing segments from disk * B getting its fragmentation below the threshold). * * Empirically, we've seen that our vdev selection for allocations is * good enough that fragmentation increases uniformly across all vdevs * the majority of the time. Thus we set the threshold percentage high * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. */ static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ static uint_t zfs_metaslab_fragmentation_threshold = 77; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change * its allocation strategy. 
Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this * bucket, and therefore a lower weight). 
*/ static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; kstat_named_t metaslabstat_reload_tree; kstat_named_t metaslabstat_too_many_tries; kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, { "too_many_tries", KSTAT_DATA_UINT64 }, { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); static kstat_t *metaslab_ksp; void metaslab_stat_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (metaslab_ksp != NULL) { metaslab_ksp->ks_data = &metaslab_stats; kstat_install(metaslab_ksp); } } void metaslab_stat_fini(void) { if (metaslab_ksp != NULL) { kstat_delete(metaslab_ksp); metaslab_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * 
========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops, boolean_t is_log) +metaslab_class_create(spa_t *spa, const char *name, + const metaslab_ops_t *ops, boolean_t is_log) { metaslab_class_t *mc; mc = kmem_zalloc(offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; + mc->mc_name = name; mc->mc_ops = ops; mc->mc_is_log = is_log; mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; mc->mc_alloc_max = UINT64_MAX; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mca->mca_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); mca->mca_rotor = NULL; mca->mca_reserved = 0; } return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; avl_destroy(&mca->mca_tree); mutex_destroy(&mca->mca_lock); ASSERT(mca->mca_rotor == NULL); ASSERT0(mca->mca_reserved); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); kmem_free(mc, offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count])); } void metaslab_class_validate(metaslab_class_t *mc) { #ifdef ZFS_DEBUG spa_t *spa = mc->mc_spa; /* * Must hold one of the spa_config locks. 
*/ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) || spa_config_held(spa, SCL_ALL, RW_WRITER)); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; metaslab_group_t *mg, *rotor; ASSERT0(avl_numnodes(&mca->mca_tree)); ASSERT0(mca->mca_reserved); if ((mg = rotor = mca->mca_rotor) == NULL) continue; do { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; vdev_t *vd = mg->mg_vd; ASSERT3P(vd->vdev_top, ==, vd); ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); ASSERT0(zfs_refcount_count(&mga->mga_queue_depth)); } while ((mg = mg->mg_next) != rotor); } #endif } /* * For each metaslab group in a class pre-calculate allocation quota and * target queue depth to balance their space usage and write performance. * Based on those pre-calculate class allocation throttle threshold for * optimal saturation. onsync is true once per TXG to enable/disable * allocation throttling and update moving average of maximum I/O size. */ void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync) { metaslab_group_t *mg, *first; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if (onsync) metaslab_class_validate(mc); if (mc->mc_groups == 0) { if (onsync) mc->mc_alloc_throttle_enabled = B_FALSE; mc->mc_alloc_max = UINT64_MAX; return; } if (onsync) { /* * Moving average of maximum allocation size, in absence of * large allocations shrinking to 1/8 of metaslab_aliquot. */ mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size + metaslab_aliquot / 8) / 4; mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 
0 : zio_dva_throttle_enabled; } mg = first = mc->mc_allocator[0].mca_rotor; /* Total disks minus parity, summed over every group on the rotor. */ uint64_t children = 0; do { children += vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd); } while ((mg = mg->mg_next) != first); uint64_t sum_aliquot = 0; do { vdev_stat_t *vs = &mg->mg_vd->vdev_stat; uint_t ratio; /* * Scale allocations per iteration with the average number of * children. Wider vdevs need more sequential allocations * to keep a decent per-child I/O size. */ uint64_t mg_aliquot = MAX(metaslab_aliquot * children / mc->mc_groups, mc->mc_alloc_io_size * 4); /* * Scale allocations per iteration with the vdev capacity, * relative to average. Bigger vdevs should get more to * fill up at the same time as smaller ones. */ if (mc->mc_space > 0 && vs->vs_space > 0) { ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups * 256) + 1); mg_aliquot = mg_aliquot * ratio / 256; } /* * Scale allocations per iteration with the vdev's free space * fraction, relative to average. Despite the above, vdevs' free * space fractions may get imbalanced, for example due to new * vdev addition or different performance. We want free space * fractions to be similar to postpone fragmentation. * * But at the same time we don't want to throttle vdevs still having * plenty of free space, that appear faster than others, even * if that causes temporary imbalance. Allow them to allocate * more by keeping their allocation queue depth equivalent to * 2.5 full iterations, even if they repeatedly drain it. Later, * as the free space shrinks, gradually reduce the target * queue depth, more strongly enforcing the free space balance. */ if (metaslab_bias_enabled && mc->mc_space > 0 && vs->vs_space > 0) { uint64_t vs_free = vs->vs_space > vs->vs_alloc ? vs->vs_space - vs->vs_alloc : 0; uint64_t mc_free = mc->mc_space > mc->mc_alloc ? mc->mc_space - mc->mc_alloc : 0; /* * vs_fr is 16 bit fixed-point free space fraction. * mc_fr is 8 bit fixed-point free space fraction. * ratio as their quotient is 8 bit fixed-point.
*/ uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1); uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1); ratio = vs_fr / (mc_fr + 1); mg->mg_aliquot = mg_aliquot * ratio / 256; /* From 2.5x at 25% full to 1x at 75%. */ ratio = MIN(163840, vs_fr * 3 + 16384); mg->mg_queue_target = MAX(mg->mg_aliquot, mg->mg_aliquot * ratio / 65536); } else { mg->mg_aliquot = mg_aliquot; mg->mg_queue_target = mg->mg_aliquot * 2; } sum_aliquot += mg->mg_aliquot; } while ((mg = mg->mg_next) != first); /* * Set per-class allocation throttle threshold to 4 iterations through * all the vdevs. This should keep all vdevs busy even if some are * allocating more than we planned for them due to bigger blocks or * better performance. */ mc->mc_alloc_max = sum_aliquot * 4; } static void metaslab_class_rotate(metaslab_group_t *mg, int allocator, uint64_t psize, boolean_t success) { metaslab_class_t *mc = mg->mg_class; metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; /* * Exit fast if there is nothing to rotate, we are not following * the rotor (copies, gangs, etc) or somebody already rotated it. */ if (mc->mc_groups < 2 || mca->mca_rotor != mg) return; /* * Always rotate in case of allocation error or a log class. */ if (!success || mc->mc_is_log) goto rotate; /* * Allocate from this group if we expect next I/O of the same size to * mostly fit within the allocation quota. Rotate if we expect it to * mostly go over the target queue depth. Meanwhile, to stripe between * groups in configured amounts per child even if we can't reach the * target queue depth, i.e. can't saturate the group write performance, * always rotate after allocating the queue target bytes. 
*/ uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2; if (naq < mg->mg_aliquot) return; if (naq >= mg->mg_queue_target) goto rotate; if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >= mg->mg_queue_target) goto rotate; /* * When the pool is not too busy, prefer restoring the vdev free space * balance instead of getting maximum speed we might not need, so that * we could have more flexibility during more busy times later. */ if (metaslab_perf_bias <= 0) goto rotate; if (metaslab_perf_bias >= 2) return; spa_t *spa = mc->mc_spa; dsl_pool_t *dp = spa_get_dsl(spa); if (dp == NULL) return; uint64_t busy_thresh = zfs_dirty_data_max * (zfs_vdev_async_write_active_min_dirty_percent + zfs_vdev_async_write_active_max_dirty_percent) / 200; if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa)) return; rotate: mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } +const char * +metaslab_class_get_name(metaslab_class_t *mc) +{ + return (mc->mc_name); +} + uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); } /* * When ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags, sum mg_histogram over * every metaslab group that belongs to this class and VERIFY that each * bucket matches mc_histogram. Returns immediately when the flag is clear. * The comparison is done while holding mc_lock. */ void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metaslab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metaslab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out.
*/ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; uint64_t now = gethrestime_sec(); /* Round delay up to next second. 
*/ uint_t delay = (metaslab_unload_delay_ms + 999) / 1000; for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { mutex_enter(&msp->ms_lock); /* * If the metaslab has been removed from the list * (which could happen if we were at the memory limit * and it was evicted during this loop), then we can't * proceed and we should restart the sublist. */ if (!multilist_link_active(&msp->ms_class_txg_node)) { mutex_exit(&msp->ms_lock); i--; break; } mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && now > msp->ms_selected_time + delay && (msp->ms_allocator == -1 || !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* * Once we've hit a metaslab selected too * recently to evict, we're done evicting for * now. */ mutex_exit(&msp->ms_lock); break; } mutex_exit(&msp->ms_lock); msp = next_msp; } } } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. 
*/ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. 
*/ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. 
*/ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } int metaslab_sort_by_flushed(const void *va, const void *vb) { const metaslab_t *a = va; const metaslab_t *b = vb; int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { spa_t *spa = mc->mc_spa; metaslab_group_t *mg; mg = kmem_zalloc(offsetof(metaslab_group_t, mg_allocator[spa->spa_alloc_count]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_queue_depth); } return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. 
*/ ASSERT(mg->mg_activation_count <= 0); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_queue_depth); } kmem_free(mg, offsetof(metaslab_group_t, mg_allocator[spa->spa_alloc_count])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } for (int i = 0; i < spa->spa_alloc_count; i++) { mc->mc_allocator[i].mca_rotor = mg; mg = mg->mg_next; } metaslab_class_balance(mc, B_FALSE); } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). 
*/ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. 
*/ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mgnext = NULL; } else { mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } for (int i = 0; i < spa->spa_alloc_count; i++) { if (mc->mc_allocator[i].mca_rotor == mg) mc->mc_allocator[i].mca_rotor = mgnext; } mg->mg_prev = NULL; mg->mg_next = NULL; metaslab_class_balance(mc, B_FALSE); } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { /* * Note that the number of nodes in mg_metaslab_tree may be one less * than vdev_ms_count, due to the embedded log metaslab. 
*/ mutex_enter(&mg->mg_lock); uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); mutex_exit(&mg->mg_lock); return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); for (metaslab_t *msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { VERIFY3P(msp->ms_group, ==, mg); /* skip if not active */ if (msp->ms_sm == NULL) continue; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } } for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); kmem_free(mg_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < 
SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. 
*/ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. Weight metaslabs * on the amount of free space. The return value will be between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if fewer than half of the metaslabs in * this group have a fragmentation metric or if the metaslabs that do have * one contain no free space. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0, total_ms = 0; uint64_t free, total_free = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_group != mg) continue; total_ms++; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; valid_ms++; free = (msp->ms_size - metaslab_allocated_space(msp)) / SPA_MINBLOCKSIZE; /* To prevent overflows. */ total_free += free; fragmentation += msp->ms_fragmentation * free; } if (valid_ms < (total_ms + 1) / 2 || total_free == 0) return (ZFS_FRAG_INVALID); fragmentation /= total_free; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree; * segments of equal size are ordered by their start offset. */ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { const zfs_range_seg32_t *r1 = x1; const zfs_range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges.
Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { const zfs_range_seg64_t *r1 = x1; const zfs_range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; struct mssa_arg { zfs_range_tree_t *rt; metaslab_rt_arg_t *mra; }; static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; zfs_range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; zfs_range_seg_max_t seg = {0}; zfs_rs_set_start(&seg, rt, start); zfs_rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } static void metaslab_size_tree_full_load(zfs_range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; zfs_range_tree_walk(rt, metaslab_size_sorted_add, &arg); } ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, zfs_range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, zfs_range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered zfs_range_tree_t and an array of * uint64_t's. 
*/ static void metaslab_rt_create(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (rt->rt_type) { case ZFS_RANGE_SEG32: size = sizeof (zfs_range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case ZFS_RANGE_SEG64: size = sizeof (zfs_range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } static void metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } static void metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } static void metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } static const zfs_range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, .rtop_remove = metaslab_rt_remove, .rtop_vacate = metaslab_rt_vacate }; /* * 
========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; zfs_range_seg_t *rs; if (t == NULL) return (0); if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, msp->ms_allocatable)); } /* * Return the maximum contiguous segment within the unflushed frees of this * metaslab. */ static uint64_t metaslab_largest_unflushed_free(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_unflushed_frees == NULL) return (0); if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); /* * When a range is freed from the metaslab, that range is added to * both the unflushed frees and the deferred frees. While the block * will eventually be usable, if the metaslab were loaded the range * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE * txgs had passed. As a result, when attempting to estimate an upper * bound for the largest currently-usable free segment in the * metaslab, we need to not consider any ranges currently in the defer * trees. This algorithm approximates the largest available chunk in * the largest range in the unflushed_frees tree by taking the first * chunk. While this may be a poor estimate, it should only remain so * briefly and should eventually self-correct as frees are no longer * deferred. Similar logic applies to the ms_freed tree. See * metaslab_load() for more details. 
* * There are two primary sources of inaccuracy in this estimate. Both * are tolerated for performance reasons. The first source is that we * only check the largest segment for overlaps. Smaller segments may * have more favorable overlaps with the other trees, resulting in * larger usable chunks. Second, we only look at the first chunk in * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. */ uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); rsize = start - rstart; } } uint64_t start = 0; uint64_t size = 0; boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; return (rsize); } static zfs_range_seg_t * metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, uint64_t size, uint64_t max_size, zfs_btree_index_t *where) { zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end(&rsearch, rt, start + max_size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { if (size == max_size) { rs = zfs_btree_next(t, where, where); } else { /* * If we're searching for a range, get the largest * segment in that range, or the smallest one bigger * than it. */ rs = zfs_btree_prev(t, where, where); if (rs == NULL || zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < size) { rs = zfs_btree_next(t, where, where); } } } return (rs); } /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking * for a block that matches the specified criteria. 
*/
/*
 * Starting at *cursor (or at rt->rt_start when the cursor is 0), walk the
 * offset-ordered tree looking for a segment that can hold `size` bytes.
 * The walk continues while the candidate starts within max_search bytes of
 * the first candidate, or while fewer than metaslab_min_search_count
 * segments have been examined. On success *found_size is set to
 * MIN(segment length, max_size), the cursor is advanced past the
 * allocation and the offset is returned. On failure the cursor and
 * *found_size are reset to 0 and -1ULL is returned.
 */
static uint64_t
metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
    uint64_t max_size, uint64_t max_search, uint64_t *found_size)
{
	if (*cursor == 0)
		*cursor = rt->rt_start;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size,
	    max_size, &where);
	/* first_found is only read inside the loop, i.e. when rs != NULL. */
	uint64_t first_found;
	int count_searched = 0;

	if (rs != NULL)
		first_found = zfs_rs_get_start(rs, rt);

	while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <=
	    max_search || count_searched < metaslab_min_search_count)) {
		uint64_t offset = zfs_rs_get_start(rs, rt);
		if (offset + size <= zfs_rs_get_end(rs, rt)) {
			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
			    max_size);
			*cursor = offset + *found_size;
			return (offset);
		}
		rs = zfs_btree_next(bt, &where, &where);
		count_searched++;
	}

	*cursor = 0;
	*found_size = 0;
	return (-1ULL);
}

/* Forward declarations needed by the allocator table below. */
static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size,
    uint64_t max_size, uint64_t *found_size);
static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size,
    uint64_t max_size, uint64_t *found_size);
static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size,
    uint64_t max_size, uint64_t *found_size);
metaslab_ops_t *metaslab_allocator(spa_t *spa);

/*
 * Table of available allocators. The index into this table is what is
 * stored in spa_active_allocator; entry 0 is the fallback used by
 * spa_set_allocator() when a name is not recognized.
 */
static metaslab_ops_t metaslab_allocators[] = {
	{ "dynamic", metaslab_df_alloc },
	{ "cursor", metaslab_cf_alloc },
	{ "new-dynamic", metaslab_ndf_alloc },
};

/*
 * Map an allocator name to its index in metaslab_allocators[].
 * Returns -1 for unknown names and, for now, also for "new-dynamic".
 */
static int
spa_find_allocator_byname(const char *val)
{
	int a = ARRAY_SIZE(metaslab_allocators) - 1;
	if (strcmp("new-dynamic", val) == 0)
		return (-1); /* remove when ndf is working */
	for (; a >= 0; a--) {
		if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
			return (a);
	}
	return (-1);
}

/*
 * Select the allocator for this pool by name, falling back to index 0
 * if the name is not accepted by spa_find_allocator_byname().
 */
void
spa_set_allocator(spa_t *spa, const char *allocator)
{
	int a = spa_find_allocator_byname(allocator);
	if (a < 0) a = 0;
	spa->spa_active_allocator = a;
	zfs_dbgmsg("spa allocator: %s", metaslab_allocators[a].msop_name);
}

/* Return the pool's allocator as an index into metaslab_allocators[]. */
int
spa_get_allocator(spa_t *spa)
{
	return (spa->spa_active_allocator);
}

#if defined(_KERNEL)
/*
 * Common handler for setting the zfs_active_allocator tunable. Strips a
 * newline from the value, validates the name, and on success points
 * zfs_active_allocator at the canonical name stored in the allocator
 * table. Returns EINVAL for a NULL or unrecognized name.
 *
 * NOTE(review): val is const-qualified, yet the newline is overwritten in
 * place through the pointer returned by strchr(). This presumably relies
 * on the caller passing a writable buffer - confirm against the callers.
 */
int
param_set_active_allocator_common(const char *val)
{
	char *p;

	if (val == NULL)
		return (SET_ERROR(EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	int a = spa_find_allocator_byname(val);
	if (a < 0)
		return (SET_ERROR(EINVAL));

	zfs_active_allocator = metaslab_allocators[a].msop_name;
	return (0);
}
#endif

/* Return the ops table entry for the allocator selected on this pool. */
metaslab_ops_t *
metaslab_allocator(spa_t *spa)
{
	int allocator = spa_get_allocator(spa);
	return (&metaslab_allocators[allocator]);
}

/*
 * ==========================================================================
 * Dynamic Fit (df) block allocator
 *
 * Search for a free chunk of at least this size, starting from the last
 * offset (for this alignment of block) looking for up to
 * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
 * found within 16MB, then return a free chunk of exactly the requested size (or
 * larger).
 *
 * If it seems like searching from the last offset will be unproductive, skip
 * that and just return a free chunk of exactly the requested size (or larger).
 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
 * mechanism is probably not very useful and may be removed in the future.
 *
 * The behavior when not searching can be changed to return the largest free
 * chunk, instead of a free chunk of exactly the requested size, by setting
 * metaslab_df_use_largest_segment.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
    uint64_t *found_size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * will not exist in the same region.
	 */
	uint64_t align = max_size & -max_size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	zfs_range_tree_t *rt = msp->ms_allocatable;
	uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
	uint64_t offset;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * If we're running low on space, find a segment based on size,
	 * rather than iterating based on offset.
	 */
	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		/* Skip the cursor search; use the cursor bucket for `size`. */
		align = size & -size;
		cursor = &msp->ms_lbas[highbit64(align) - 1];
		offset = -1;
	} else {
		offset = metaslab_block_picker(rt, cursor, size, max_size,
		    metaslab_df_max_search, found_size);
		if (max_size != size && offset == -1) {
			/* Retry with the cursor bucket derived from `size`. */
			align = size & -size;
			cursor = &msp->ms_lbas[highbit64(align) - 1];
			offset = metaslab_block_picker(rt, cursor, size,
			    max_size, metaslab_df_max_search, found_size);
		}
	}

	if (offset == -1) {
		zfs_range_seg_t *rs;
		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
			metaslab_size_tree_full_load(msp->ms_allocatable);

		if (metaslab_df_use_largest_segment) {
			/* use largest free segment */
			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
		} else {
			zfs_btree_index_t where;
			/* use segment of this size, or next largest */
			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
			    rt, msp->ms_start, size, max_size, &where);
		}
		if (rs != NULL && zfs_rs_get_start(rs, rt) + size <=
		    zfs_rs_get_end(rs, rt)) {
			offset = zfs_rs_get_start(rs, rt);
			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
			    max_size);
			*cursor = offset + *found_size;
		}
	}

	return (offset);
}

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
* ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
    uint64_t *found_size)
{
	zfs_range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
	/* ms_lbas[0] holds the cursor, ms_lbas[1] the end of its range. */
	uint64_t *pos = &msp->ms_lbas[0];
	uint64_t *limit = &msp->ms_lbas[1];

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(*limit, >=, *pos);

	if (*pos + size > *limit) {
		/*
		 * The current range cannot satisfy the request; move the
		 * cursor to the last (largest) segment of the size tree.
		 */
		if (zfs_btree_numnodes(by_size) == 0)
			metaslab_size_tree_full_load(msp->ms_allocatable);

		zfs_range_seg_t *largest = zfs_btree_last(by_size, NULL);
		if (largest == NULL)
			return (-1ULL);

		uint64_t seg_start = zfs_rs_get_start(largest, rt);
		uint64_t seg_end = zfs_rs_get_end(largest, rt);
		if (seg_end - seg_start < size)
			return (-1ULL);

		*pos = seg_start;
		*limit = seg_end;
	}

	/* Hand out up to max_size bytes and advance the cursor past them. */
	uint64_t offset = *pos;
	uint64_t remaining = *limit - offset;

	*found_size = MIN(remaining, max_size);
	*pos = offset + *found_size;

	return (offset);
}

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
*/
uint64_t metaslab_ndf_clump_shift = 4;

/*
 * New dynamic fit allocation. Returns the offset of the chosen segment and
 * stores MIN(segment length, max_size) in *found_size, or returns -1ULL if
 * the largest allocatable segment is smaller than `size` or no suitable
 * segment is found. The caller must hold ms_lock.
 */
static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
    uint64_t *found_size)
{
	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
	zfs_range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch;
	uint64_t hbit = highbit64(max_size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_possible_size = metaslab_largest_allocatable(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if (max_possible_size < size)
		return (-1ULL);

	/* First try: a max_size segment at the cursor for max_size's bucket. */
	zfs_rs_set_start(&rsearch, rt, *cursor);
	zfs_rs_set_end(&rsearch, rt, *cursor + max_size);

	rs = zfs_btree_find(t, &rsearch, &where);
	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
	    max_size) {
		/* Second try: switch to the cursor bucket for `size`. */
		hbit = highbit64(size);
		cursor = &msp->ms_lbas[hbit - 1];
		zfs_rs_set_start(&rsearch, rt, *cursor);
		zfs_rs_set_end(&rsearch, rt, *cursor + size);

		rs = zfs_btree_find(t, &rsearch, &where);
	}
	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
	    size) {
		/*
		 * Fall back to the size-sorted tree, looking for a clump of
		 * 2^(hbit + metaslab_ndf_clump_shift) bytes capped at the
		 * largest allocatable segment.
		 */
		t = &msp->ms_allocatable_by_size;

		zfs_rs_set_start(&rsearch, rt, 0);
		zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift)));

		rs = zfs_btree_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = zfs_btree_next(t, &where, &where);
		ASSERT(rs != NULL);
	}

	if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) {
		*found_size = MIN(zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt), max_size);
		*cursor = zfs_rs_get_start(rs, rt) + *found_size;
		return (zfs_rs_get_start(rs, rt));
	}
	return (-1ULL);
}

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
*/
static void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* cv_wait() is passed ms_lock; loop to re-check after each wakeup. */
	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

/*
 * Wait for any in-progress flushing to complete.
 */
static void
metaslab_flush_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_flushing)
		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
}

/*
 * Multilist index function: map a metaslab to a sublist by its ms_id.
 */
static unsigned int
metaslab_idx_func(multilist_t *ml, void *arg)
{
	metaslab_t *msp = arg;

	/*
	 * ms_id values are allocated sequentially, so full 64bit
	 * division would be a waste of time, so limit it to 32 bits.
	 */
	return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml));
}

/* Return the metaslab's cached allocated-space counter. */
uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
	return (msp->ms_allocated_space);
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 * This is a no-op unless ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags.
 */
static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocating = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_condensing);

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an
	 * allocated space map. Calling this in non-syncing context
	 * does not provide a consistent view of the metaslab since
	 * we're performing allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	/*
	 * Even though the smp_alloc field can get negative,
	 * when it comes to a metaslab's space map, that should
	 * never be the case.
	 */
	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);

	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
	    zfs_range_tree_space(msp->ms_unflushed_frees));

	ASSERT3U(metaslab_allocated_space(msp), ==,
	    space_map_allocated(msp->ms_sm) +
	    zfs_range_tree_space(msp->ms_unflushed_allocs) -
	    zfs_range_tree_space(msp->ms_unflushed_frees));

	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);

	/*
	 * Account for future allocations since we would have
	 * already deducted that space from the ms_allocatable.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocating +=
		    zfs_range_tree_space(msp->ms_allocating[(txg + t) &
		    TXG_MASK]);
	}
	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
	    msp->ms_allocating_total);

	ASSERT3U(msp->ms_deferspace, ==,
	    zfs_range_tree_space(msp->ms_defer[0]) +
	    zfs_range_tree_space(msp->ms_defer[1]));

	/* Free space as seen by the in-core trees. */
	msp_free_space = zfs_range_tree_space(msp->ms_allocatable) +
	    allocating + msp->ms_deferspace +
	    zfs_range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/* Zero ms_synchist and every ms_deferhist[] of a loaded metaslab. */
static void
metaslab_aux_histograms_clear(metaslab_t *msp)
{
	/*
	 * Auxiliary histograms are only cleared when resetting them,
	 * which can only happen while the metaslab is loaded.
	 */
	ASSERT(msp->ms_loaded);

	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
}

/*
 * Add the range tree's histogram, starting at bucket `shift`, into
 * `histogram`. Range tree buckets that lie beyond the last histogram slot
 * are all accumulated into that last slot, scaled up by the power-of-two
 * distance between the two buckets.
 */
static void
metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
    zfs_range_tree_t *rt)
{
	/*
	 * This is modeled after space_map_histogram_add(), so refer to that
	 * function for implementation details. We want this to work like
	 * the space map histogram, and not the range tree histogram, as we
	 * are essentially constructing a delta that will be later subtracted
	 * from the space map histogram.
	 */
	int idx = 0;
	for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		ASSERT3U(i, >=, idx + shift);
		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);

		/* idx stops advancing once it reaches the final slot. */
		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
			ASSERT3U(idx + shift, ==, i);
			idx++;
			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
		}
	}
}

/*
 * Called at every sync pass that the metaslab gets synced.
 *
 * The reason is that we want our auxiliary histograms to be updated
 * wherever the metaslab's space map histogram is updated. This way
 * we stay consistent on which parts of the metaslab space map's
 * histogram are currently not available for allocations (e.g. because
 * they are in the defer, freed, and freeing trees).
 */
static void
metaslab_aux_histograms_update(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	ASSERT(sm != NULL);

	/*
	 * This is similar to the metaslab's space map histogram updates
	 * that take place in metaslab_sync(). The only difference is that
	 * we only care about segments that haven't made it into the
	 * ms_allocatable tree yet.
	 */
	if (msp->ms_loaded) {
		metaslab_aux_histograms_clear(msp);

		metaslab_aux_histogram_add(msp->ms_synchist,
		    sm->sm_shift, msp->ms_freed);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			metaslab_aux_histogram_add(msp->ms_deferhist[t],
			    sm->sm_shift, msp->ms_defer[t]);
		}
	}

	metaslab_aux_histogram_add(msp->ms_synchist,
	    sm->sm_shift, msp->ms_freeing);
}

/*
 * Called every time we are done syncing (writing to) the metaslab,
 * i.e. at the end of each sync pass.
 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
 */
static void
metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;

	if (sm == NULL) {
		/*
		 * We came here from metaslab_init() when creating/opening a
		 * pool, looking at a metaslab that hasn't had any allocations
		 * yet.
		 */
		return;
	}

	/*
	 * This is similar to the actions that we take for the ms_freed
	 * and ms_defer trees in metaslab_sync_done().
	 */
	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
	if (defer_allowed) {
		memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
		    sizeof (msp->ms_synchist));
	} else {
		memset(msp->ms_deferhist[hist_index], 0,
		    sizeof (msp->ms_deferhist[hist_index]));
	}
	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
}

/*
 * Ensure that the metaslab's weight and fragmentation are consistent
 * with the contents of the histogram (either the range tree's histogram
 * or the space map's depending whether the metaslab is loaded).
 */
static void
metaslab_verify_weight_and_frag(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can end up here from vdev_remove_complete(), in which case we
	 * cannot do these assertions because we hold spa config locks and
	 * thus we are not allowed to read from the DMU.
	 *
	 * We check if the metaslab group has been removed and if that's
	 * the case we return immediately as that would mean that we are
	 * here from the aforementioned code path.
	 */
	if (msp->ms_group == NULL)
		return;

	/*
	 * Devices being removed always return a weight of 0 and leave
	 * fragmentation and ms_max_size as is - there is nothing for
	 * us to verify here.
	 */
	vdev_t *vd = msp->ms_group->mg_vd;
	if (vd->vdev_removing)
		return;

	/*
	 * If the metaslab is dirty it probably means that we've done
	 * some allocations or frees that have changed our histograms
	 * and thus the weight.
	 */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&vd->vdev_ms_list, msp, t))
			return;
	}

	/*
	 * This verification checks that our in-memory state is consistent
	 * with what's on disk. If the pool is read-only then there aren't
	 * any changes and we just have the initially-loaded state.
	 */
	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
		return;

	/* some extra verification for in-core tree if you can */
	if (msp->ms_loaded) {
		zfs_range_tree_stat_verify(msp->ms_allocatable);
		VERIFY(space_map_histogram_verify(msp->ms_sm,
		    msp->ms_allocatable));
	}

	/* Snapshot the current values so they can be compared or restored. */
	uint64_t weight = msp->ms_weight;
	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
	uint64_t frag = msp->ms_fragmentation;
	uint64_t max_segsize = msp->ms_max_size;

	msp->ms_weight = 0;
	msp->ms_fragmentation = 0;

	/*
	 * This function is used for verification purposes and thus should
	 * not introduce any side-effects/mutations on the system's state.
	 *
	 * Regardless of whether metaslab_weight() thinks this metaslab
	 * should be active or not, we want to ensure that the actual weight
	 * (and therefore the value of ms_weight) would be the same if it
	 * was to be recalculated at this point.
	 *
	 * In addition we set the nodirty flag so metaslab_weight() does
	 * not dirty the metaslab for future TXGs (e.g. when trying to
	 * force condensing to upgrade the metaslab spacemaps).
	 */
	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;

	VERIFY3U(max_segsize, ==, msp->ms_max_size);

	/*
	 * If the weight type changed then there is no point in doing
	 * verification. Revert fields to their original values.
	 */
	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
		msp->ms_fragmentation = frag;
		msp->ms_weight = weight;
		return;
	}

	VERIFY3U(msp->ms_fragmentation, ==, frag);
	VERIFY3U(msp->ms_weight, ==, weight);
}

/*
 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
 * this class that was used longest ago, and attempt to unload it.  We don't
 * want to spend too much time in this loop to prevent performance
 * degradation, and we expect that most of the time this operation will
 * succeed.
Between that and the normal unloading processing during txg sync, * we expect this to keep the metaslab memory usage under control. */ static void metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); if (!multilist_link_active(&msp->ms_class_txg_node)) { multilist_sublist_unlock(mls); break; } metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); /* * If the metaslab is currently loading there are two * cases. If it's the metaslab we're evicting, we * can't continue on or we'll panic when we attempt to * recursively lock the mutex. If it's another * metaslab that's loading, it can be safely skipped, * since we know it's very new and therefore not a * good eviction candidate. We check later once the * lock is held that the metaslab is fully loaded * before actually unloading it. */ if (msp->ms_loading) { msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); continue; } /* * We can't unload metaslabs with no spacemap because * they're not ready to be unloaded yet. 
We can't * unload metaslabs with outstanding allocations * because doing so could cause the metaslab's weight * to decrease while it's unloaded, which violates an * invariant that we use to prevent unnecessary * loading. We also don't unload metaslabs that are * currently active because they are high-weight * metaslabs that are likely to be used in the near * future. */ mutex_enter(&msp->ms_lock); if (msp->ms_allocator == -1 && msp->ms_sm != NULL && msp->ms_allocating_total == 0) { metaslab_unload(msp); } mutex_exit(&msp->ms_lock); msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } #else (void) mc, (void) zfs_metaslab_mem_limit; #endif } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We temporarily drop the lock to unblock other operations while we * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * * If we are using the log space maps, metaslab_sync() can't write to * the metaslab's space map while we are loading as we only write to * it when we are flushing the metaslab, and that can't happen while * we are loading it. * * If we are not using log space maps though, metaslab_sync() can * append to the space map while we are loading. Therefore we load * only entries that existed when we started the load. Additionally, * metaslab_sync_done() has to wait for the load to complete because * there are potential races like metaslab_load() loading parts of the * space map that are currently being appended by metaslab_sync(). If * we didn't, the ms_allocatable would have entries that * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. 
*/ uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); metaslab_rt_arg_t *mrap; if (msp->ms_allocatable->rt_arg == NULL) { mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); } else { mrap = msp->ms_allocatable->rt_arg; msp->ms_allocatable->rt_ops = NULL; msp->ms_allocatable->rt_arg = NULL; } mrap->mra_bt = &msp->ms_allocatable_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); /* Now, populate the size-sorted tree. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; struct mssa_arg arg = {0}; arg.rt = msp->ms_allocatable; arg.mra = mrap; zfs_range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, &arg); } else { /* * Add the size-sorted tree first, since we don't need to load * the metaslab from the spacemap. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ zfs_range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. */ ASSERT(zfs_range_tree_is_empty( msp->ms_unflushed_allocs)); ASSERT(zfs_range_tree_is_empty( msp->ms_unflushed_frees)); } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from * changing the ms_sm (or log_sm) and the metaslab's range trees * while we are about to use them and populate the ms_allocatable. 
* The ms_lock is insufficient for this because metaslab_sync() doesn't * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); ASSERT(!msp->ms_condensing); ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); return (error); } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * Apply all the unflushed changes to ms_allocatable right * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_remove, msp->ms_allocatable); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_add, msp->ms_allocatable); ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); /* * If we use a log space map we add all the segments * that are in ms_unflushed_frees so they are available * for allocation. * * ms_allocatable needs to contain all free segments * that are ready for allocations (thus not segments * from ms_freeing, ms_freed, and the ms_defer trees). * But if we grab the lock in this code path at a sync * pass later that 1, then it also contains the * segments of ms_freed (they were added to it earlier * in this path through ms_unflushed_frees). So we * need to remove all the segments that exist in * ms_freed from ms_allocatable as they will be added * later in metaslab_sync_done(). * * When there's no log space map, the ms_allocatable * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_remove, msp->ms_allocatable); } /* * If we are not using the log space map, ms_allocatable * contains the segments that exist in the ms_defer trees * [see ms_synced_length]. 
Thus we need to remove them * from ms_allocatable as they will be added again in * metaslab_sync_done(). * * If we are using the log space map, ms_allocatable still * contains the segments that exist in the ms_defer trees. * Not because it read them through the ms_sm though. But * because these segments are part of ms_unflushed_frees * whose segments we add to ms_allocatable earlier in this * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], zfs_range_tree_remove, msp->ms_allocatable); } /* * Call metaslab_recalculate_weight_and_sort() now that the * metaslab is loaded so we get the metaslab's real weight. * * Unless this metaslab was created with older software and * has not yet been converted to use segment-based weight, we * expect the new weight to be better or equal to the weight * that the metaslab had while it was not loaded. This is * because the old weight does not take into account the * consolidation of adjacent segments between TXGs. 
[see * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_largest_allocatable(msp); ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); msp->ms_load_time = load_end; - zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " + zfs_dbgmsg("metaslab_load: txg %llu, spa %s, class %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, unloaded time %llu ms, " "loading_time %lld ms, ms_max_size %llu, " "max size error %lld, " "old_weight %llx, new_weight %llx", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_class->mc_name, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)zfs_range_tree_space(msp->ms_freed), (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]), (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]), (longlong_t)((load_start - msp->ms_unload_time) / 1000000), (longlong_t)((load_end - load_start) / 1000000), (u_longlong_t)msp->ms_max_size, (u_longlong_t)msp->ms_max_size - max_size, (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. 
*/ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other * threads know that there is already a thread that is loading this * metaslab. */ msp->ms_loading = B_TRUE; /* * Wait for any in-progress flushing to finish as we drop the ms_lock * both here (during space_map_load()) and in metaslab_flush() (when * we flush our changes to the ms_sm). */ if (msp->ms_flushing) metaslab_flush_wait(msp); /* * In the possibility that we were waiting for the metaslab to be * flushed (where we temporarily dropped the ms_lock), ensure that * no one else loaded the metaslab somehow. */ ASSERT(!msp->ms_loaded); /* * If we're loading a metaslab in the normal class, consider evicting * another one to keep our memory usage under the limit defined by the * zfs_metaslab_mem_limit tunable. */ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == msp->ms_group->mg_class) { metaslab_potentially_evict(msp->ms_group->mg_class); } int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * This can happen if a metaslab is selected for eviction (in * metaslab_potentially_evict) and then unloaded during spa_sync (via * metaslab_class_evict_old). 
*/ if (!msp->ms_loaded) return; zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; if (msp->ms_group != NULL) { metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " - "ms_id %llu, weight %llx, " + zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, class %s, " + "vdev_id %llu, ms_id %llu, weight %llx, " "selected txg %llu (%llu s ago), alloc_txg %llu, " "loaded %llu ms ago, max_size %llu", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), + msp->ms_group->mg_class->mc_name, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_weight, (u_longlong_t)msp->ms_selected_txg, (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) - msp->ms_selected_time), (u_longlong_t)msp->ms_alloc_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, (u_longlong_t)msp->ms_max_size); } /* * We explicitly recalculate the metaslab's weight based on its space * map (as it is now not loaded). We want unload metaslabs to always * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation * and the sorting. */ if (msp->ms_group != NULL) metaslab_recalculate_weight_and_sort(msp); } /* * We want to optimize the memory use of the per-metaslab range * trees. 
To do this, we store the segments in the range trees in * units of sectors, zero-indexing from the start of the metaslab. If * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; return (ZFS_RANGE_SEG32); } else { *shift = 0; *start = 0; return (ZFS_RANGE_SEG64); } } void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; msp->ms_selected_time = gethrestime_sec(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); multilist_link_init(&ms->ms_class_txg_node); 
ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; vdev_ops_t *ops = vd->vdev_ops; if (ops->vdev_op_metaslab_init != NULL) ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. For * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_SIZE; t++) { ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; ms->ms_unflushed_frees 
= zfs_range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), metaslab_unflushed_dirty(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); /* * If this metaslab hasn't been through metaslab_sync_done() yet its * 
space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); zfs_range_tree_destroy(msp->ms_allocatable); zfs_range_tree_destroy(msp->ms_freeing); zfs_range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_allocs); zfs_range_tree_destroy(msp->ms_checkpointing); zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); zfs_range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { zfs_range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); zfs_range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. 
This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. * * This table defines 0% fragmented space using 512M segments. Using this value, * we derive the rest of the table. This table originally went up to 16MB, but * with larger recordsizes, larger ashifts, and use of raidz3, it is possible * to have significantly larger allocations than were previously possible. * Since the fragmentation value is never stored on disk, it is possible to * change these calculations in the future. */ static const int zfs_frag_table[] = { 100, /* 512B */ 99, /* 1K */ 97, /* 2K */ 93, /* 4K */ 88, /* 8K */ 83, /* 16K */ 77, /* 32K */ 71, /* 64K */ 64, /* 128K */ 57, /* 256K */ 50, /* 512K */ 43, /* 1M */ 36, /* 2M */ 29, /* 4M */ 23, /* 8M */ 17, /* 16M */ 12, /* 32M */ 7, /* 64M */ 3, /* 128M */ 1, /* 256M */ 0, /* 512M */ }; #define FRAGMENTATION_TABLE_SIZE \ (sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0]))) /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not * been upgraded and does not support this metric. Otherwise, the return * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. 
*/ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. We also skip marking this metaslab for * condensing if the caller has explicitly set nodirty. */ if (!nodirty && spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, (u_longlong_t)vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The baseline weight is the metaslab's free space. 
*/ space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. 
*/ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. Should be applied * only to unloaded metaslabs (i.e no incoming allocations) in-order to * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(!msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(space_map_object(sm), !=, 0); ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * Create a joint histogram from all the segments that have made * it to the metaslab's space map histogram, that are not yet * available for allocation because they are still in the freeing * pipeline (e.g. 
freeing, freed, and defer trees). Then subtract * these segments from the space map's histogram to get a more * accurate weight. */ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) deferspace_histogram[i] += msp->ms_synchist[i]; for (int t = 0; t < TXG_DEFER_SIZE; t++) { for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { deferspace_histogram[i] += msp->ms_deferhist[t][i]; } } uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { ASSERT3U(sm->sm_phys->smp_histogram[i], >=, deferspace_histogram[i]); uint64_t count = sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; if (count != 0) { WEIGHT_SET_COUNT(weight, count); WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. 
Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab is loaded, then we can determine if the desired allocation * can be satisfied by looking at the size of the maximum free segment * on that metaslab. Otherwise, we make our decision based on the metaslab's * weight. For segment-based weighting we can determine the maximum * allocation based on the index encoded in its value. For space-based * weights we rely on the entire weight (excluding the weight-type bit). */ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* * This case will usually but not always get caught by the checks below; * metaslabs can be loaded by various means, including the trim and * initialize code. Once that happens, without this check they are * allocatable even before they finish their first txg sync. */ if (unlikely(msp->ms_new)) return (B_FALSE); /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in * try_hard and it's been less than zfs_metaslab_max_size_cache_sec * seconds since the metaslab was unloaded. 
*/ if (msp->ms_loaded || (msp->ms_max_size != 0 && !try_hard && gethrtime() < msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp, boolean_t nodirty) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_set_fragmentation(msp, nodirty); /* * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. If the metaslab is * unloaded, we check if there's a larger free segment in the * unflushed frees. This is a lower bound on the largest allocatable * segment size. Coalescing of adjacent entries may reveal larger * allocatable segments, but we aren't aware of those until loading * the space map into a range tree. */ if (msp->ms_loaded) { msp->ms_max_size = metaslab_largest_allocatable(msp); } else { msp->ms_max_size = MAX(msp->ms_max_size, metaslab_largest_unflushed_free(msp)); } /* * Segment-based weighting requires space map histogram support. 
*/ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp, B_FALSE) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) { ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(mg, msp, msp->ms_weight | activation_weight); return (0); } metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? &mga->mga_primary : &mga->mga_secondary); mutex_enter(&mg->mg_lock); if (*mspp != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } *mspp = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort_impl(mg, msp, msp->ms_weight | activation_weight); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The current metaslab is already activated for us so there * is nothing to do. 
Already activated though, doesn't mean * that this metaslab is activated for our allocator nor our * requested activation weight. The metaslab could have started * as an active one for our allocator but changed allocators * while we were waiting to grab its ms_lock or we stole it * [see find_valid_metaslab()]. This means that there is a * possibility of passivating a metaslab of another allocator * or from a different activation mask, from this thread. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { ASSERT(msp->ms_loaded); return (0); } int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } /* * When entering metaslab_load() we may have dropped the * ms_lock because we were loading this metaslab, or we * were waiting for another thread to load it for us. In * that scenario, we recheck the weight of the metaslab * to see if it was activated by another thread. * * If the metaslab was activated for another allocator or * it was activated with a different activation weight (e.g. * we wanted to make it a primary but it was activated as * secondary) we return error (EBUSY). * * If the metaslab was activated for the same allocator * and requested activation mask, skip activating it. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if (msp->ms_allocator != allocator) return (EBUSY); if ((msp->ms_weight & activation_weight) == 0) return (SET_ERROR(EBUSY)); EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), msp->ms_primary); return (0); } /* * If the metaslab has literally 0 space, it will have weight 0. In * that case, don't bother activating it. This can happen if the * metaslab had space during find_valid_metaslab, but another thread * loaded it and used all that space while we were waiting to grab the * lock. 
*/ if (msp->ms_weight == 0) { ASSERT0(zfs_range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); ASSERT3S(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_class->mc_spa->spa_alloc_count); metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; if (msp->ms_primary) { ASSERT3P(mga->mga_primary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mga->mga_primary = NULL; } else { ASSERT3P(mga->mga_secondary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mga->mga_secondary = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. 
*/
	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
	    size >= SPA_MINBLOCKSIZE ||
	    zfs_range_tree_space(msp->ms_allocatable) == 0);
	ASSERT0(weight & METASLAB_ACTIVE_MASK);

	ASSERT(msp->ms_activation_weight != 0);
	msp->ms_activation_weight = 0;
	metaslab_passivate_allocator(msp->ms_group, msp, weight);
	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
}

/*
 * Segment-based metaslabs are activated once and remain active until
 * we either fail an allocation attempt (similar to space-based metaslabs)
 * or have exhausted the free space in zfs_metaslab_switch_threshold
 * buckets since the metaslab was activated. This function checks to see
 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 * metaslab and passivates it proactively. This will allow us to select a
 * metaslab with a larger contiguous region, if any, remaining within this
 * metaslab group. If we're in sync pass > 1, then we continue using this
 * metaslab so that we don't dirty more blocks and cause more sync passes.
 */
static void
metaslab_segment_may_passivate(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	/* Nothing to do for space-based weights or after sync pass 1. */
	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
		return;

	/*
	 * As long as a single largest free segment covers the majority of
	 * free space (more than 15/16 of it), don't consider the metaslab
	 * fragmented. It should allow us to fill new unfragmented metaslabs
	 * full before switching.
	 */
	if (metaslab_largest_allocatable(msp) >
	    zfs_range_tree_space(msp->ms_allocatable) * 15 / 16)
		return;

	/*
	 * Since we are in the middle of a sync pass, the most accurate
	 * information that is accessible to us is the in-core range tree
	 * histogram; calculate the new weight based on that information.
*/
	uint64_t weight = metaslab_weight_from_range_tree(msp);
	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
	int current_idx = WEIGHT_GET_INDEX(weight);

	/*
	 * Passivate once the current index has fallen at least
	 * zfs_metaslab_switch_threshold below the index recorded at
	 * activation time.
	 */
	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
		metaslab_passivate(msp, weight);
}

/*
 * Task function dispatched by metaslab_group_preload(); arg is the
 * metaslab_t to preload. Under ms_lock it calls metaslab_load() (the result
 * is deliberately ignored) and records the currently syncing txg with
 * metaslab_set_selected_txg(). The work is bracketed by
 * spl_fstrans_mark()/spl_fstrans_unmark(). The group's mg_lock must not be
 * held by the calling thread (asserted).
 */
static void
metaslab_preload(void *arg)
{
	metaslab_t *msp = arg;
	metaslab_class_t *mc = msp->ms_group->mg_class;
	spa_t *spa = mc->mc_spa;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));

	mutex_enter(&msp->ms_lock);
	(void) metaslab_load(msp);
	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
	mutex_exit(&msp->ms_lock);
	spl_fstrans_unmark(cookie);
}

/*
 * Dispatch metaslab_preload() tasks for metaslabs of group mg, walking
 * mg_metaslab_tree from its first node while holding mg_lock. Does nothing
 * when the pool is shutting down or metaslab_preload_enabled is clear.
 */
static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
		return;

	mutex_enter(&mg->mg_lock);

	/*
	 * Load the next potential metaslabs
	 */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
		ASSERT3P(msp->ms_group, ==, mg);

		/*
		 * We preload only the maximum number of metaslabs specified
		 * by metaslab_preload_limit. If a metaslab is being forced
		 * to condense then we preload it too. This will ensure
		 * that force condensing happens in the next txg.
		 */
		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
			continue;
		}

		/*
		 * The first spa_alloc_count dispatches are queued with
		 * TQ_FRONT; a failed dispatch is fatal (VERIFY).
		 */
		VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
		    msp, TQ_SLEEP | (m <= spa->spa_alloc_count ? TQ_FRONT : 0))
		    != TASKQID_INVALID);
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance for
 * inefficiency. We would like to use the following criteria to make our
 * decision:
 *
 * 1. Do not condense if the size of the space map object would dramatically
 *    increase as a result of writing out the free space range tree.
 *
 * 2. Condense if the on-disk space map representation is at least
 *    zfs_condense_pct/100 times the size of the optimal representation
 *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
 *
 * 3. Do not condense if the on-disk size of the space map does not actually
 *    decrease.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
 *
 * The caller must hold ms_lock, the metaslab must be loaded with a space
 * map, and this must be sync pass 1 (all asserted). Returns B_TRUE when the
 * metaslab should be condensed.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	vdev_t *vd = msp->ms_group->mg_vd;
	uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);
	ASSERT(sm != NULL);
	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);

	/*
	 * We always condense metaslabs that are empty and metaslabs for
	 * which a condense request has been made.
	 */
	if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0 ||
	    msp->ms_condense_wanted)
		return (B_TRUE);

	/* A "record" is the larger of the space map and vdev block sizes. */
	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
	uint64_t object_size = space_map_length(sm);
	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
	    msp->ms_allocatable, SM_NO_VDEVID);

	/*
	 * Condense only when the object is at least zfs_condense_pct percent
	 * of the estimated optimal size and also larger than
	 * zfs_metaslab_condense_block_threshold records.
	 */
	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > zfs_metaslab_condense_block_threshold * record_size);
}

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed
 * by the entries of the free range tree (ms_allocatable). The condensed
 * spacemap contains all the entries of previous TXGs (including those in
 * the pool-wide log spacemaps; thus this is effectively a superset of
 * metaslab_flush()), but this TXG's entries still need to be written.
*/
static void
metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
{
	zfs_range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_sm != NULL);

	/*
	 * In order to condense the space map, we need to change it so it
	 * only describes which segments are currently allocated and free.
	 *
	 * All the current free space resides in the ms_allocatable, all
	 * the ms_defer trees, and all the ms_allocating trees. We ignore
	 * ms_freed because it is empty because we're in sync pass 1. We
	 * ignore ms_freeing because these changes are not yet reflected
	 * in the spacemap (they will be written later this txg).
	 *
	 * So to truncate the space map to represent all the entries of
	 * previous TXGs we do the following:
	 *
	 * 1] We create a range tree (condense tree) that is 100% empty.
	 * 2] We add to it all segments found in the ms_defer trees
	 *    as those segments are marked as free in the original space
	 *    map. We do the same with the ms_allocating trees for the same
	 *    reason. Adding these segments should be a relatively
	 *    inexpensive operation since we expect these trees to have a
	 *    small number of nodes.
	 * 3] We vacate any unflushed allocs, since they are not frees we
	 *    need to add to the condense tree. Then we vacate any
	 *    unflushed frees as they should already be part of ms_allocatable.
	 * 4] At this point, we would ideally like to add all segments
	 *    in the ms_allocatable tree to the condense tree. This way
	 *    we would write all the entries of the condense tree as the
	 *    condensed space map, which would only contain freed
	 *    segments with everything else assumed to be allocated.
	 *
	 *    Doing so can be prohibitively expensive as ms_allocatable can
	 *    be large, and therefore computationally expensive to add to
	 *    the condense_tree. Instead we first sync out an entry marking
	 *    everything as allocated, then the ms_allocatable and then the
	 *    condense_tree, in the condensed space map. While this is not
	 *    optimal, it is typically close to optimal and more importantly
	 *    much cheaper to compute.
	 *
	 * 5] Finally, as both of the unflushed trees were written to our
	 *    new and condensed metaslab space map, we basically flushed
	 *    all the unflushed changes to disk, thus we call
	 *    metaslab_flush_update().
	 */
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */

	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
	    (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable),
	    msp->ms_condense_wanted ? "TRUE" : "FALSE");

	/* The request (if any) is being honored now. */
	msp->ms_condense_wanted = B_FALSE;

	zfs_range_seg_type_t type;
	uint64_t shift, start;
	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
	    &start, &shift);

	/* Steps 1 and 2: build the condense tree from defer + allocating. */
	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		zfs_range_tree_walk(msp->ms_defer[t],
		    zfs_range_tree_add, condense_tree);
	}

	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
		    zfs_range_tree_add, condense_tree);
	}

	/*
	 * Step 3: take this metaslab's unflushed changes out of the pool-wide
	 * memory accounting before vacating both unflushed trees.
	 */
	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
	    metaslab_unflushed_changes_memused(msp));
	spa->spa_unflushed_stats.sus_memused -=
	    metaslab_unflushed_changes_memused(msp);
	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);

	/*
	 * We're about to drop the metaslab's lock thus allowing other
	 * consumers to change its content. Set the metaslab's ms_condensing
	 * flag to ensure that allocations on this metaslab do not occur
	 * while we're in the middle of committing it to disk. This is only
	 * critical for ms_allocatable as all other range trees use per TXG
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	uint64_t object = space_map_object(msp->ms_sm);
	space_map_truncate(sm,
	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);

	/*
	 * space_map_truncate() may have reallocated the spacemap object.
	 * If so, update the vdev_ms_array.
	 */
	if (space_map_object(msp->ms_sm) != object) {
		object = space_map_object(msp->ms_sm);
		dmu_write(spa->spa_meta_objset,
		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
		    msp->ms_id, sizeof (uint64_t), &object, tx);
	}

	/*
	 * Note:
	 * When the log space map feature is enabled, each space map will
	 * always have ALLOCS followed by FREES for each sync pass. This is
	 * typically true even when the log space map feature is disabled,
	 * except from the case where a metaslab goes through metaslab_sync()
	 * and gets condensed. In that case the metaslab's space map will have
	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
	 * sync pass 1.
	 */
	/* tmp_tree covers the whole metaslab: one "everything allocated" entry. */
	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
	    start, shift);
	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);

	/* Both temporary trees are emptied and released. */
	zfs_range_tree_vacate(condense_tree, NULL, NULL);
	zfs_range_tree_destroy(condense_tree);
	zfs_range_tree_vacate(tmp_tree, NULL, NULL);
	zfs_range_tree_destroy(tmp_tree);
	mutex_enter(&msp->ms_lock);

	msp->ms_condensing = B_FALSE;
	metaslab_flush_update(msp, tx);
}

/*
 * Start tracking msp in the pool's spa_metaslabs_by_flushed tree.
 *
 * Under spa_flushed_ms_lock the metaslab's unflushed txg is set to the
 * currently syncing txg, it is marked dirty and inserted into the tree.
 * Afterwards the log space map metaslab count and the log summary are
 * updated. Asserted preconditions: a syncing log space map exists, the
 * metaslab has a space map, and both unflushed trees are empty.
 */
static void
metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	ASSERT(spa_syncing_log_sm(spa) != NULL);
	ASSERT(msp->ms_sm != NULL);
	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));

	mutex_enter(&spa->spa_flushed_ms_lock);
	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
	metaslab_set_unflushed_dirty(msp, B_TRUE);
	avl_add(&spa->spa_metaslabs_by_flushed, msp);
	mutex_exit(&spa->spa_flushed_ms_lock);

	spa_log_sm_increment_current_mscount(spa);
	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
}

/*
 * Re-position an already tracked metaslab in spa_metaslabs_by_flushed with
 * its unflushed txg set to the currently syncing txg and its dirty flag set
 * to `dirty`. The per-log and summary metaslab counts are moved from the
 * previous txg/dirty state to the current one, and finally
 * spa_cleanup_old_sm_logs() is called. The metaslab's unflushed trees must
 * be empty and tx must not be past spa_final_dirty_txg() (VERIFY).
 */
void
metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	ASSERT(spa_syncing_log_sm(spa) != NULL);
	ASSERT(msp->ms_sm != NULL);
	ASSERT(metaslab_unflushed_txg(msp) != 0);
	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));

	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));

	/* update metaslab's position in our flushing tree */
	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
	boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
	mutex_enter(&spa->spa_flushed_ms_lock);
	/* Remove before changing the txg/dirty state, then re-insert. */
	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
	metaslab_set_unflushed_dirty(msp, dirty);
	avl_add(&spa->spa_metaslabs_by_flushed, msp);
	mutex_exit(&spa->spa_flushed_ms_lock);

	/* update metaslab counts of spa_log_sm_t nodes */
	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
	spa_log_sm_increment_current_mscount(spa);

	/* update log space map summary */
	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
	    ms_prev_flushed_dirty);
	spa_log_summary_add_flushed_metaslab(spa, dirty);

	/* cleanup obsolete logs if any */
	spa_cleanup_old_sm_logs(spa, tx);
}

/*
 * Called when the metaslab has been flushed (its own spacemap now reflects
 * all the contents of the pool-wide spacemap log). Updates the metaslab's
 * metadata and any pool-wide related log space map data (e.g. summary,
 * obsolete logs, etc..) to reflect that.
 */
static void
metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
{
	metaslab_group_t *mg = msp->ms_group;
	spa_t *spa = mg->mg_vd->vdev_spa;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	ASSERT3U(spa_sync_pass(spa), ==, 1);

	/*
	 * Just because a metaslab got flushed, that doesn't mean that
	 * it will pass through metaslab_sync_done(). Thus, make sure to
	 * update ms_synced_length here in case it doesn't.
	 */
	msp->ms_synced_length = space_map_length(msp->ms_sm);

	/*
	 * We may end up here from metaslab_condense() without the
	 * feature being active. In that case this is a no-op.
*/
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
	    metaslab_unflushed_txg(msp) == 0)
		return;

	metaslab_unflushed_bump(msp, tx, B_FALSE);
}

/*
 * Flush the metaslab's unflushed allocs and frees into its own space map.
 *
 * The caller must hold ms_lock, this must be sync pass 1, and the log space
 * map feature must be active (all asserted). Returns B_FALSE without doing
 * anything when the metaslab is currently being loaded; otherwise returns
 * B_TRUE after either condensing the metaslab or appending the unflushed
 * trees to its space map. ms_lock is dropped around the space map writes.
 */
boolean_t
metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));

	ASSERT(msp->ms_sm != NULL);
	ASSERT(metaslab_unflushed_txg(msp) != 0);
	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);

	/*
	 * There is nothing wrong with flushing the same metaslab twice, as
	 * this codepath should work on that case. However, the current
	 * flushing scheme makes sure to avoid this situation as we would be
	 * making all these calls without having anything meaningful to write
	 * to disk. We assert this behavior here.
	 */
	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));

	/*
	 * We can not flush while loading, because then we would
	 * not load the ms_unflushed_{allocs,frees}.
	 */
	if (msp->ms_loading)
		return (B_FALSE);

	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
	metaslab_verify_weight_and_frag(msp);

	/*
	 * Metaslab condensing is effectively flushing. Therefore if the
	 * metaslab can be condensed we can just condense it instead of
	 * flushing it.
	 *
	 * Note that metaslab_condense() does call metaslab_flush_update()
	 * so we can just return immediately after condensing. We also
	 * don't need to care about setting ms_flushing or broadcasting
	 * ms_flush_cv, even if we temporarily drop the ms_lock in
	 * metaslab_condense(), as the metaslab is already loaded.
	 */
	if (msp->ms_loaded && metaslab_should_condense(msp)) {
		metaslab_group_t *mg = msp->ms_group;

		/*
		 * For all histogram operations below refer to the
		 * comments of metaslab_sync() where we follow a
		 * similar procedure.
		 */
		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);
		metaslab_group_histogram_remove(mg, msp);

		metaslab_condense(msp, tx);

		/* Rebuild the space map histogram from the in-core trees. */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
		ASSERT(zfs_range_tree_is_empty(msp->ms_freed));
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			space_map_histogram_add(msp->ms_sm,
			    msp->ms_defer[t], tx);
		}
		metaslab_aux_histograms_update(msp);

		metaslab_group_histogram_add(mg, msp);
		metaslab_group_histogram_verify(mg);
		metaslab_class_histogram_verify(mg->mg_class);

		metaslab_verify_space(msp, dmu_tx_get_txg(tx));

		/*
		 * Since we recreated the histogram (and potentially
		 * the ms_sm too while condensing) ensure that the
		 * weight is updated too because we are not guaranteed
		 * that this metaslab is dirty and will go through
		 * metaslab_sync_done().
		 */
		metaslab_recalculate_weight_and_sort(msp);
		return (B_TRUE);
	}

	/*
	 * Regular flush: ms_flushing stays set while ms_lock is dropped for
	 * the writes below; it is cleared and ms_flush_cv is broadcast at
	 * the end of this function.
	 */
	msp->ms_flushing = B_TRUE;
	uint64_t sm_len_before = space_map_length(msp->ms_sm);

	mutex_exit(&msp->ms_lock);
	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
	    SM_NO_VDEVID, tx);
	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
	    SM_NO_VDEVID, tx);
	mutex_enter(&msp->ms_lock);

	uint64_t sm_len_after = space_map_length(msp->ms_sm);
	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
		    spa_name(spa),
		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
		    (u_longlong_t)msp->ms_id,
		    (u_longlong_t)zfs_range_tree_space(
		    msp->ms_unflushed_allocs),
		    (u_longlong_t)zfs_range_tree_space(
		    msp->ms_unflushed_frees),
		    (u_longlong_t)(sm_len_after - sm_len_before));
	}

	/*
	 * The unflushed changes are on disk now: remove them from the
	 * pool-wide memory accounting and empty both trees.
	 */
	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
	    metaslab_unflushed_changes_memused(msp));
	spa->spa_unflushed_stats.sus_memused -=
	    metaslab_unflushed_changes_memused(msp);
	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);

	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
	metaslab_verify_weight_and_frag(msp);

	metaslab_flush_update(msp, tx);

	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
	metaslab_verify_weight_and_frag(msp);

	msp->ms_flushing = B_FALSE;
	cv_broadcast(&msp->ms_flush_cv);
	return (B_TRUE);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	/* Allocations made in the txg being synced. */
	zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (msp->ms_new) {
		ASSERT0(zfs_range_tree_space(alloctree));
		ASSERT0(zfs_range_tree_space(msp->ms_freeing));
		ASSERT0(zfs_range_tree_space(msp->ms_freed));
		ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
		ASSERT0(zfs_range_tree_space(msp->ms_trim));
		return;
	}

	/*
	 * Normally, we don't want to process a metaslab if there are no
	 * allocations or frees to perform. However, if the metaslab is being
	 * forced to condense, it's loaded and we're not beyond the final
	 * dirty txg, we need to let it through. Not condensing beyond the
	 * final dirty txg prevents an issue where metaslabs that need to be
	 * condensed but were loaded for other reasons could cause a panic
	 * here. By only checking the txg in that branch of the conditional,
	 * we preserve the utility of the VERIFY statements in all other
	 * cases.
	 */
	if (zfs_range_tree_is_empty(alloctree) &&
	    zfs_range_tree_is_empty(msp->ms_freeing) &&
	    zfs_range_tree_is_empty(msp->ms_checkpointing) &&
	    !(msp->ms_loaded && msp->ms_condense_wanted &&
	    txg <= spa_final_dirty_txg(spa)))
		return;


	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));

	/*
	 * The only state that can actually be changing concurrently
	 * with metaslab_sync() is the metaslab's ms_allocatable.
No
	 * other thread can be modifying this txg's alloc, freeing,
	 * freed, or space_map_phys_t. We drop ms_lock whenever we
	 * could call into the DMU, because the DMU can call down to
	 * us (e.g. via zio_free()) at any time.
	 *
	 * The spa_vdev_remove_thread() can be reading metaslab state
	 * concurrently, and it is locked out by the ms_sync_lock.
	 * Note that the ms_lock is insufficient for this, because it
	 * is dropped by space_map_write().
	 */
	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * Generate a log space map if one doesn't exist already.
	 */
	spa_generate_syncing_log_sm(spa, tx);

	/*
	 * First sync of this metaslab: allocate its space map object,
	 * record the object number in vdev_ms_array and open it.
	 */
	if (msp->ms_sm == NULL) {
		uint64_t new_object = space_map_alloc(mos,
		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
		    zfs_metaslab_sm_blksz_with_log :
		    zfs_metaslab_sm_blksz_no_log, tx);
		VERIFY3U(new_object, !=, 0);

		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    msp->ms_id, sizeof (uint64_t), &new_object, tx);

		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
		ASSERT(msp->ms_sm != NULL);

		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
		ASSERT0(metaslab_allocated_space(msp));
	}

	/*
	 * There is checkpointed space to record but the vdev has no
	 * checkpoint space map yet: create and open one.
	 */
	if (!zfs_range_tree_is_empty(msp->ms_checkpointing) &&
	    vd->vdev_checkpoint_sm == NULL) {
		ASSERT(spa_has_checkpoint(spa));

		uint64_t new_object = space_map_alloc(mos,
		    zfs_vdev_standard_sm_blksz, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

		/*
		 * We save the space map object as an entry in vdev_top_zap
		 * so it can be retrieved when the pool is reopened after an
		 * export or through zdb.
		 */
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
		    sizeof (new_object), 1, &new_object, tx));
	}

	/* ms_sync_lock is taken before ms_lock and held until the end. */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	/*
	 * Note: metaslab_condense() clears the space map's histogram.
	 * Therefore we must verify and remove this histogram before
	 * condensing.
	 */
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	metaslab_group_histogram_remove(mg, msp);

	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
	    metaslab_should_condense(msp))
		metaslab_condense(msp, tx);

	/*
	 * We'll be going to disk to sync our space accounting, thus we
	 * drop the ms_lock during that time so allocations coming from
	 * open-context (ZIL) for future TXGs do not block.
	 */
	mutex_exit(&msp->ms_lock);
	space_map_t *log_sm = spa_syncing_log_sm(spa);
	if (log_sm != NULL) {
		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
		/*
		 * Make sure the metaslab is tracked as unflushed and dirty
		 * before its changes are appended to the log space map.
		 */
		if (metaslab_unflushed_txg(msp) == 0)
			metaslab_unflushed_add(msp, tx);
		else if (!metaslab_unflushed_dirty(msp))
			metaslab_unflushed_bump(msp, tx, B_TRUE);

		space_map_write(log_sm, alloctree, SM_ALLOC,
		    vd->vdev_id, tx);
		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
		    vd->vdev_id, tx);
		mutex_enter(&msp->ms_lock);

		/*
		 * Fold this txg's allocs and frees into the unflushed trees.
		 * The memory accounting is subtracted before and re-added
		 * after the update so that it follows the new tree sizes.
		 */
		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
		    metaslab_unflushed_changes_memused(msp));
		spa->spa_unflushed_stats.sus_memused -=
		    metaslab_unflushed_changes_memused(msp);
		zfs_range_tree_remove_xor_add(alloctree,
		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
		zfs_range_tree_remove_xor_add(msp->ms_freeing,
		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
		spa->spa_unflushed_stats.sus_memused +=
		    metaslab_unflushed_changes_memused(msp);
	} else {
		/* No log space map: write to the metaslab's own space map. */
		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));

		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
		    SM_NO_VDEVID, tx);
		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
		    SM_NO_VDEVID, tx);
		mutex_enter(&msp->ms_lock);
	}

	/* Apply this txg's allocations and frees to ms_allocated_space. */
	msp->ms_allocated_space += zfs_range_tree_space(alloctree);
	ASSERT3U(msp->ms_allocated_space, >=,
	    zfs_range_tree_space(msp->ms_freeing));
	msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing);

	if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) {
		ASSERT(spa_has_checkpoint(spa));
		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

		/*
		 * Since we are doing writes to disk and the ms_checkpointing
		 * tree won't be changing during that time, we drop the
		 * ms_lock while writing to the checkpoint space map, for the
		 * same reason mentioned above.
		 */
		mutex_exit(&msp->ms_lock);
		space_map_write(vd->vdev_checkpoint_sm,
		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
		mutex_enter(&msp->ms_lock);

		spa->spa_checkpoint_info.sci_dspace +=
		    zfs_range_tree_space(msp->ms_checkpointing);
		vd->vdev_stat.vs_checkpoint_space +=
		    zfs_range_tree_space(msp->ms_checkpointing);
		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
		    -space_map_allocated(vd->vdev_checkpoint_sm));

		zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
	}

	if (msp->ms_loaded) {
		/*
		 * When the space map is loaded, we have an accurate
		 * histogram in the range tree. This gives us an opportunity
		 * to bring the space map's histogram up-to-date so we clear
		 * it first before updating it.
		 */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);

		/*
		 * Since we've cleared the histogram we need to add back
		 * any free space that has already been processed, plus
		 * any deferred space. This allows the on-disk histogram
		 * to accurately reflect all free space even if some space
		 * is not yet available for allocation (i.e. deferred).
		 */
		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);

		/*
		 * Add back any deferred free space that has not been
		 * added back into the in-core free tree yet. This will
		 * ensure that we don't end up with a space map histogram
		 * that is completely empty unless the metaslab is fully
		 * allocated.
		 */
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			space_map_histogram_add(msp->ms_sm,
			    msp->ms_defer[t], tx);
		}
	}

	/*
	 * Always add the free space from this sync pass to the space
	 * map histogram. We want to make sure that the on-disk histogram
	 * accounts for all free space. If the space map is not loaded,
	 * then we will lose some accuracy but will correct it the next
	 * time we load the space map.
	 */
	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
	metaslab_aux_histograms_update(msp);

	metaslab_group_histogram_add(mg, msp);
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	/*
	 * For sync pass 1, we avoid traversing this txg's free range tree
	 * and instead will just swap the pointers for freeing and freed.
	 * We can safely do this since the freed_tree is guaranteed to be
	 * empty on the initial pass.
	 *
	 * Keep in mind that even if we are currently using a log spacemap
	 * we want current frees to end up in the ms_allocatable (but not
	 * get appended to the ms_sm) so their ranges can be reused as usual.
	 */
	if (spa_sync_pass(spa) == 1) {
		zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
		ASSERT0(msp->ms_allocated_this_txg);
	} else {
		zfs_range_tree_vacate(msp->ms_freeing,
		    zfs_range_tree_add, msp->ms_freed);
	}
	msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree);
	zfs_range_tree_vacate(alloctree, NULL, NULL);

	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
	ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
	    & TXG_MASK]));
	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));

	mutex_exit(&msp->ms_lock);

	/*
	 * Verify that the space map object ID has been recorded in the
	 * vdev_ms_array.
	 */
	uint64_t object;
	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
	VERIFY3U(object, ==, space_map_object(msp->ms_sm));

	mutex_exit(&msp->ms_sync_lock);
	dmu_tx_commit(tx);
}

/*
 * Passivate and unload a metaslab. Nothing is done when the metaslab is not
 * loaded or is disabled (ms_disabled != 0). The ms_allocating trees for
 * txg + 1 through txg + TXG_CONCURRENT_STATES - 1 must be empty (VERIFY).
 * The metaslab is passivated only if it still has an allocator assigned,
 * and unloaded only when metaslab_debug_unload is not set.
 */
static void
metaslab_evict(metaslab_t *msp, uint64_t txg)
{
	if (!msp->ms_loaded || msp->ms_disabled != 0)
		return;

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		VERIFY0(zfs_range_tree_space(
		    msp->ms_allocating[(txg + t) & TXG_MASK]));
	}
	/* Passivate with the current weight minus its activation bits. */
	if (msp->ms_allocator != -1)
		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);

	if (!metaslab_debug_unload)
		metaslab_unload(msp);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	zfs_range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;
	boolean_t defer_allowed = B_TRUE;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	if (msp->ms_new) {
		/* this is a new metaslab, add its capacity to the vdev */
		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);

		/* there should be no allocations nor frees at this point */
		VERIFY0(msp->ms_allocated_this_txg);
		VERIFY0(zfs_range_tree_space(msp->ms_freed));
	}

	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));

	/* The defer trees are used round-robin, indexed by txg. */
	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];

	/*
	 * Frees are not deferred when the normal class's free space is at
	 * or below the slop space, or when vdev_removing or
	 * vdev_rz_expanding is set on the vdev.
	 */
	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
	    metaslab_class_get_alloc(spa_normal_class(spa));
	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
	    vd->vdev_rz_expanding) {
		defer_allowed = B_FALSE;
	}

	defer_delta = 0;
	alloc_delta = msp->ms_allocated_this_txg -
	    zfs_range_tree_space(msp->ms_freed);

	/*
	 * The defer tree selected above is emptied further down, so its
	 * space always leaves the deferred total; the newly freed space
	 * enters it only when deferring is allowed.
	 */
	if (defer_allowed) {
		defer_delta = zfs_range_tree_space(msp->ms_freed) -
		    zfs_range_tree_space(*defer_tree);
	} else {
		defer_delta -= zfs_range_tree_space(*defer_tree);
	}
metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
	    defer_delta, 0);

	if (spa_syncing_log_sm(spa) == NULL) {
		/*
		 * If there's a metaslab_load() in progress and we don't have
		 * a log space map, it means that we probably wrote to the
		 * metaslab's space map. If this is the case, we need to
		 * make sure that we wait for the load to complete so that we
		 * have a consistent view at the in-core side of the metaslab.
		 */
		metaslab_load_wait(msp);
	} else {
		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
	}

	/*
	 * When auto-trimming is enabled, free ranges which are added to
	 * ms_allocatable are also added to ms_trim. The ms_trim tree is
	 * periodically consumed by the vdev_autotrim_thread() which issues
	 * trims for all ranges and then vacates the tree. The ms_trim tree
	 * can be discarded at any time with the sole consequence of recent
	 * frees not being trimmed.
	 */
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
		zfs_range_tree_walk(*defer_tree, zfs_range_tree_add,
		    msp->ms_trim);
		if (!defer_allowed) {
			zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add,
			    msp->ms_trim);
		}
	} else {
		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
	}

	/*
	 * Move the frees from the defer_tree back to the free
	 * range tree (if it's loaded). Swap the freed_tree and
	 * the defer_tree -- this is safe to do because we've
	 * just emptied out the defer_tree.
	 */
	zfs_range_tree_vacate(*defer_tree,
	    msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable);
	if (defer_allowed) {
		zfs_range_tree_swap(&msp->ms_freed, defer_tree);
	} else {
		/* No deferral: this txg's frees go straight back as well. */
		zfs_range_tree_vacate(msp->ms_freed,
		    msp->ms_loaded ? zfs_range_tree_add : NULL,
		    msp->ms_allocatable);
	}

	msp->ms_synced_length = space_map_length(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}
	metaslab_aux_histograms_update_done(msp, defer_allowed);

	/* A new metaslab becomes ready after its first completed sync. */
	if (msp->ms_new) {
		msp->ms_new = B_FALSE;
		mutex_enter(&mg->mg_lock);
		mg->mg_ms_ready++;
		mutex_exit(&mg->mg_lock);
	}

	/*
	 * Re-sort metaslab within its group now that we've adjusted
	 * its allocatable space.
	 */
	metaslab_recalculate_weight_and_sort(msp);

	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
	ASSERT0(zfs_range_tree_space(msp->ms_freed));
	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
	/* This txg's allocations are no longer pending. */
	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
	msp->ms_allocated_this_txg = 0;
	mutex_exit(&msp->ms_lock);
}

/*
 * Reassess a metaslab group: with SCL_ALLOC held as reader, recompute
 * mg_fragmentation, call metaslab_group_alloc_update() and, for groups with
 * a positive activation count, kick off preloading.
 */
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_class->mc_spa;

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
	metaslab_group_alloc_update(mg);

	/*
	 * Preload the next potential metaslabs but only on active
	 * metaslab groups. We can get into a state where the metaslab
	 * is no longer active since we dirty metaslabs as we remove a
	 * device, thus potentially making the metaslab group eligible
	 * for preloading.
	 */
	if (mg->mg_activation_count > 0) {
		metaslab_group_preload(mg);
	}
	spa_config_exit(spa, SCL_ALLOC, FTAG);
}

/*
 * When writing a ditto block (i.e. more than one DVA for a given BP) on
 * the same vdev as an existing DVA of this BP, then try to allocate it
 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
*/ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. 
*/ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new) { ASSERT0(new->zal_size); list_move_tail(&new->zal_list, &old->zal_list); new->zal_size = old->zal_size; list_destroy(&old->zal_list); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); } void metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator, int flags, uint64_t psize, const void *tag) { for (int d = 0; d < BP_GET_NDVAS(bp); d++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); metaslab_group_alloc_increment(spa, vdev, allocator, flags, psize, tag); } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator, int flags, uint64_t psize, const void *tag) { if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = 
&mg->mg_allocator[allocator];
	(void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag);
}

/*
 * Try to allocate from a single metaslab using the class's msop_alloc
 * operation, which receives size and max_size and reports the length it
 * obtained through *actual_size.
 *
 * On success (start != -1ULL) the range is removed from ms_allocatable and
 * ms_trim, added to ms_allocating[txg & TXG_MASK], and the metaslab is
 * dirtied for txg if this is its first allocation in that txg. Whether or
 * not the attempt succeeded, ms_max_size is refreshed afterwards.
 *
 * The caller must hold ms_lock. Returns the starting offset, or -1ULL when
 * msop_alloc could not satisfy the request.
 */
static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
    uint64_t txg, uint64_t *actual_size)
{
	uint64_t start;
	zfs_range_tree_t *rt = msp->ms_allocatable;
	metaslab_class_t *mc = msp->ms_group->mg_class;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	VERIFY(!msp->ms_condensing);
	VERIFY0(msp->ms_disabled);
	VERIFY0(msp->ms_new);

	start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size);
	if (start != -1ULL) {
		/* From here on work with the size that was really obtained. */
		size = *actual_size;
		metaslab_group_t *mg = msp->ms_group;
		vdev_t *vd = mg->mg_vd;

		/* Both ends of the range must be ashift-aligned. */
		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size);
		zfs_range_tree_remove(rt, start, size);
		zfs_range_tree_clear(msp->ms_trim, start, size);

		/* First allocation in this txg: make sure we get synced. */
		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start,
		    size);
		msp->ms_allocating_total += size;

		/* Track the last successful allocation */
		msp->ms_alloc_txg = txg;
		metaslab_verify_space(msp, txg);
	}

	/*
	 * Now that we've attempted the allocation we need to update the
	 * metaslab's maximum block size since it may have changed.
	 */
	msp->ms_max_size = metaslab_largest_allocatable(msp);
	return (start);
}

/*
 * Find the metaslab with the highest weight that is less than what we've
 * already tried. In the common case, this means that we will examine each
 * metaslab at most once. Note that concurrent callers could reorder metaslabs
 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
 * activated by another thread, and we fail to allocate from the metaslab we
 * have selected, we may not try the newly-activated metaslab, and instead
 * activate another metaslab.
This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, uint64_t asize, int allocator, boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!try_hard && tries > zfs_metaslab_find_max_tries) { METASLABSTAT_BUMP(metaslabstat_too_many_tries); return (NULL); } tries++; if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or disabled, or * hasn't gone through a metaslab_sync_done(), then skip it. */ if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. 
*/ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; if (!try_hard) { for (i = 0; i < d; i++) { if (!metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } static void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t max_asize, uint64_t txg, dva_t *dva, int d, int allocator, boolean_t try_hard, uint64_t *actual_asize) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active, we just use the 0th slot. 
*/ if (allocator >= mg->mg_ms_ready / 3) allocator = 0; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mga->mga_primary != NULL) { msp = mga->mga_primary; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus it is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mga->mga_secondary != NULL) { msp = mga->mga_secondary; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, asize, allocator, try_hard, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) break; mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. 
*/ #if 0 DTRACE_PROBE3(ms__activation__attempt, metaslab_t *, msp, uint64_t, activation_weight, boolean_t, was_active); #endif /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab was activated for another allocator * while we were waiting in the ms_lock above, or it's * a primary and we're seeking a secondary (or vice versa), * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { ASSERT(msp->ms_loaded); ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } /* * This metaslab was used for claiming regions allocated * by the ZIL during pool import. Once these regions are * claimed we don't need to keep the CLAIM bit set * anymore. Passivate this metaslab to zero its activation * mask. 
*/ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { ASSERT(msp->ms_loaded); ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } metaslab_set_selected_txg(msp, txg); int activation_error = metaslab_activate(msp, allocator, activation_weight); metaslab_active_mask_verify(msp); /* * If the metaslab was activated by another thread for * another allocator or activation_weight (EBUSY), or it * failed because another metaslab was assigned as primary * for this allocator (EEXIST) we continue using this * metaslab for our allocation, rather than going on to a * worse metaslab (we waited for that metaslab to be loaded * after all). * * If the activation failed due to an I/O error or ENOSPC we * skip to the next metaslab. */ boolean_t activated; if (activation_error == 0) { activated = B_TRUE; } else if (activation_error == EBUSY || activation_error == EEXIST) { activated = B_FALSE; } else { mutex_exit(&msp->ms_lock); continue; } ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() * can accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. 
*/ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, max_asize, txg, actual_asize); if (offset != -1ULL) { metaslab_trace_add(zal, mg, msp, *actual_asize, d, offset, allocator); /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); mutex_exit(&msp->ms_lock); break; } metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. 
*/ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); } if (activated) { metaslab_passivate(msp, weight); } else { /* * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. * * Note that we could attempt to use something like * metaslab_recalculate_weight_and_sort() that * retains the activation mask here. That function * uses metaslab_weight() to set the weight though * which is not as accurate as the calculations * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); } metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } kmem_free(search, sizeof (*search)); if (offset == -1ULL) { metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize <= vdev_get_min_alloc(mg->mg_vd)) { /* * This metaslab group was unable to allocate * the minimum block size so it must be out of * space. Notify the allocation throttle to * skip allocation attempts to this group until * more space becomes available. */ mg->mg_no_free_space = B_TRUE; } } return (offset); } static boolean_t metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize, int d, int flags, boolean_t try_hard, zio_alloc_list_t *zal, int allocator) { metaslab_class_t *mc = mg->mg_class; vdev_t *vd = mg->mg_vd; boolean_t allocatable; /* * Don't allocate from faulted devices. 
*/ if (try_hard) spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); if (try_hard) spa_config_exit(spa, SCL_ZIO, FTAG); if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); return (B_FALSE); } if (!try_hard) { /* * Avoid vdevs with too little space or too fragmented. */ if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space || (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); return (B_FALSE); } /* * Avoid writing single-copy data to an unhealthy, * non-redundant vdev. */ if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); return (B_FALSE); } } return (B_TRUE); } static int metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, uint64_t max_psize, dva_t *dva, int d, const dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg = NULL, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will result in more split blocks when using device removal, * and a large number of split blocks coupled with ztest-induced * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. 
*/ if (psize >= metaslab_force_ganging && metaslab_force_ganging_pct > 0 && (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } if (max_psize > psize && max_psize >= metaslab_force_ganging && metaslab_force_ganging_pct > 0 && (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { max_psize = MAX((psize + max_psize) / 2, metaslab_force_ganging); } ASSERT3U(psize, <=, max_psize); /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. It makes our * fault domains something tractable. 
*/ if (hintdva && DVA_IS_VALID(&hintdva[d])) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); mg = vdev_get_mg(vd, mc); } if (mg == NULL && d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vdev_get_mg(vd, mc)->mg_next; } if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; } rotor = mg; top: do { ASSERT(mg->mg_activation_count == 1); ASSERT(mg->mg_class == mc); if (!metaslab_group_allocatable(spa, mg, psize, d, flags, try_hard, zal, allocator)) goto next; vd = mg->mg_vd; uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize, txg); ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); uint64_t offset = metaslab_group_alloc(mg, zal, asize, max_asize, txg, dva, d, allocator, try_hard, &asize); if (offset != -1ULL) { if (actual_psize) *actual_psize = vdev_asize_to_psize_txg(vd, asize, txg); metaslab_class_rotate(mg, allocator, psize, B_TRUE); DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: metaslab_class_rotate(mg, allocator, psize, B_FALSE); } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, perhaps do so now. */ if (!try_hard && (zfs_metaslab_try_hard_before_gang || GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || psize <= spa->spa_min_alloc)) { METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } /* * Allocate a block for the specified i/o. 
*/ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, const dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva, txg, flags, zal, allocator, NULL)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int m = offset >> vd->vdev_ms_shift; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); VERIFY3U(m, <, vd->vdev_ms_count); msp = vd->vdev_ms[m]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (zfs_range_tree_is_empty(msp->ms_freeing) && zfs_range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); zfs_range_tree_add(msp->ms_checkpointing, offset, asize); } else { zfs_range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is 
concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; static void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. 
*/ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Cloned blocks can not be remapped since BRT depends on specific * vdev id and offset in the DVA[0] for its reference counting. */ if (!BP_IS_METADATA(bp) && brt_maybe_exists(spa, bp)) return (B_FALSE); /* * Note: we only remap dva[0]. 
If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. 
*/ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); zfs_range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. 
If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, boolean_t must, boolean_t *more) { metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); if (mc->mc_alloc_io_size < zio->io_size) { mc->mc_alloc_io_size = zio->io_size; metaslab_class_balance(mc, B_FALSE); } if (must || mca->mca_reserved <= mc->mc_alloc_max) { /* * The potential race between compare and add is covered by the * allocator lock in most cases, or irrelevant due to must set. * But even if we assume some other non-existing scenario, the * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. */ int64_t delta = slots * zio->io_size; *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= mc->mc_alloc_max); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } *more = B_FALSE; return (B_FALSE); } boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) { metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; ASSERT(mc->mc_alloc_throttle_enabled); int64_t delta = slots * zio->io_size; return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= mc->mc_alloc_max); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (SET_ERROR(ENXIO)); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == EBUSY) { ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); error = 0; } } if (error 
== 0 && !zfs_range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); zfs_range_tree_remove(msp->ms_allocatable, offset, size); zfs_range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (!multilist_link_active(&msp->ms_class_txg_node)) { msp->ms_selected_txg = txg; multilist_sublist_insert_head(mls, msp); } multilist_sublist_unlock(mls); if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). 
*/ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, const blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, const void *tag) { return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg, hintbp, flags, zal, allocator, tag, NULL)); } int metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg, const blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator, const void *tag, uint64_t *actual_psize) { dva_t *dva = bp->blk_dva; const dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; int error = 0; ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); uint64_t cur_psize = 0; for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, dva, d, hintdva, txg, flags, zal, allocator, actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); if (actual_psize) max_psize = MIN(cur_psize, max_psize); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); if (actual_psize) *actual_psize = max_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. 
That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. 
*/ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner, (void) arg; if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { zfs_range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. * * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call zfs_range_tree_verify() in the * current allocating tree. 
*/ zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, size); zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any allocations on this metaslab. * We must also track how many metaslabs are currently disabled within a * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. 
*/ void metaslab_disable(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_disabled_lock); /* * To keep an accurate count of how many threads have disabled * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_disabled will be accurate when we decide to mark a metaslab * group as disabled. To do this we force all other threads * to wait till the metaslab's mg_disabled_updating flag is no * longer set. */ metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { metaslab_group_disabled_increment(mg); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; mutex_exit(&msp->ms_lock); mg->mg_disabled_updating = B_FALSE; cv_broadcast(&mg->mg_ms_disabled_cv); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; /* * Wait for the outstanding IO to be synced to prevent newly * allocated blocks from being overwritten. This used by * initialize and TRIM which are modifying unallocated space. 
*/ if (sync) txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&mg->mg_ms_disabled_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); if (unload) metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) { ms->ms_unflushed_dirty = dirty; } static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { vdev_t *vd = ms->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); metaslab_unflushed_phys_t entry = { .msp_unflushed_txg = metaslab_unflushed_txg(ms), }; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object, tx)); } else { VERIFY0(err); } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, &entry, tx); } void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } boolean_t metaslab_unflushed_dirty(metaslab_t *ms) { return (ms->ms_unflushed_dirty); } uint64_t metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. 
stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, "Load all metaslabs when pool is first opened"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, "Prevent metaslabs from being unloaded"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, "Max number of metaslabs per group to preload"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, "Enable space-based metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW, "Enable performance-based metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); ZFS_MODULE_PARAM(zfs_metaslab, 
zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are sometimes forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 29c1d4ddf47c..bca022af6d75 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1,11119 +1,11123 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. * Copyright (c) 2023, 2024, Klara Inc. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" #include /* * spa_thread() existed on Illumos as a parent thread for the various worker * threads that actually run the pool, as a way to both reference the entire * pool work as a single object, and to share properties like scheduling * options. It has not yet been adapted to Linux or FreeBSD. This define is * used to mark related parts of the code to make things easier for the reader, * and to compile this code out. It can be removed when someone implements it, * moves it to some Illumos-specific place, or removes it entirely. */ #undef HAVE_SPA_THREAD /* * The "System Duty Cycle" scheduling class is an Illumos feature to help * prevent CPU-intensive kernel threads from affecting latency on interactive * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is * gated behind a define. On Illumos SDC depends on spa_thread(), but * spa_thread() also has other uses, so this is a separate define. */ #undef HAVE_SYSDC /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for high priority I/Os that * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT * implementation, so separate high priority threads are used there. 
*/
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
#ifdef illumos
	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
#else
	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* WRITE */
#endif
	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};

/* Forward declarations of file-local helpers. */
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type,
    const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

/*
 * Percentage of all CPUs that can be used by the metaslab preload taskq.
 */
static uint_t metaslab_preload_pct = 50;

/*
 * NOTE(review): the trailing comment below says "1 thread per cpu", but the
 * name and the value 80 suggest a percentage -- confirm and reword.
 */
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq; /* threads per taskq */

#ifdef HAVE_SYSDC
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
#endif

#ifdef HAVE_SPA_THREAD
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif

/* NOTE(review): presumably threads per write-issue taskq -- confirm. */
static uint_t zio_taskq_write_tpq = 16;

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t spa_load_verify_dryrun = B_FALSE;

/*
 * Allow reading spacemaps in case of a read-only import
 * (spa_mode == SPA_MODE_READ).
 * This is used by zdb for spacemap verification.
 */
boolean_t spa_mode_readable_spacemaps = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
*/
#define	TRYIMPORT_NAME	"$import"

/*
 * For debugging purposes: print out the vdev tree during pool import.
 */
static int spa_load_print_vdev_tree = B_FALSE;

/*
 * A non-zero value for zfs_max_missing_tvds means that we allow importing
 * pools with missing top-level vdevs. This is strictly intended for advanced
 * pool recovery cases since missing data is almost inevitable. Pools with
 * missing devices can only be imported read-only for safety reasons, and their
 * fail-mode will be automatically set to "continue".
 *
 * With 1 missing vdev we should be able to import the pool and mount all
 * datasets. User data that was not modified after the missing device was
 * added should be recoverable. This means that snapshots created prior to the
 * addition of that device should be completely intact.
 *
 * With 2 missing vdevs, some datasets may fail to mount since there are
 * dataset statistics that are stored as regular metadata. Some data might be
 * recoverable if those vdevs were added recently.
 *
 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 * may be missing entirely. Chances of data recovery are very low. Note that
 * there are also risks of performing an inadvertent rewind as we might be
 * missing all the vdevs with the latest uberblocks.
 */
uint64_t zfs_max_missing_tvds = 0;

/*
 * The parameters below are similar to zfs_max_missing_tvds but are only
 * intended for a preliminary open of the pool with an untrusted config which
 * might be incomplete or outdated.
 *
 * We are more tolerant for pools opened from a cachefile since we could have
 * an outdated cachefile where a device removal was not registered.
 * We could have set the limit arbitrarily high but in the case where devices
 * are really missing we would want to return the proper error codes; we chose
 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 * and we get a chance to retrieve the trusted config.
*/
uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;

/*
 * In the case where config was assembled by scanning device paths (/dev/dsks
 * by default) we are less tolerant since all the existing devices should have
 * been detected and we want spa_load to return the right error codes.
 */
uint64_t zfs_max_missing_tvds_scan = 0;

/*
 * Debugging aid that pauses spa_sync() towards the end.
 */
static const boolean_t zfs_pause_spa_sync = B_FALSE;

/*
 * Variables to indicate that the livelist condense zthr func should wait at
 * certain points for the livelist to be removed. Used to test
 * condense/destroy races.
 */
static int zfs_livelist_condense_zthr_pause = 0;
static int zfs_livelist_condense_sync_pause = 0;

/*
 * Variables to track whether or not condense cancellation has been
 * triggered in testing.
 */
static int zfs_livelist_condense_sync_cancel = 0;
static int zfs_livelist_condense_zthr_cancel = 0;

/*
 * Variable to track whether or not extra ALLOC blkptrs were added to a
 * livelist entry while it was being condensed (caused by the way we track
 * remapped blkptrs in dbuf_remap_impl).
 */
static int zfs_livelist_condense_new_alloc = 0;

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
*/ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static int spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) { zpool_prop_t prop = zpool_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_NONE; uint64_t intval; int err; /* * NB: Not all properties lookups via this API require * the spa props lock, so they must explicitly grab it here. */ switch (prop) { case ZPOOL_PROP_DEDUPCACHED: err = ddt_get_pool_dedup_cached(spa, &intval); if (err != 0) return (SET_ERROR(err)); break; default: return (SET_ERROR(EINVAL)); } spa_prop_add_list(outnvl, prop, NULL, intval, src); return (0); } int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, nvlist_t *outnvl) { int err = 0; if (props == NULL) return (0); for (unsigned int i = 0; i < n_props && err == 0; i++) { err = spa_prop_add(spa, props[i], outnvl); } return (err); } /* * Add a user property (source=src, propname=propval) to an nvlist. */ static void spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, zprop_source_t src) { nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. 
*/ static void spa_prop_get_config(spa_t *spa, nvlist_t *nv) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 
0 : (alloc * 100 / size); spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, brt_get_used(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, brt_get_saved(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, ddt_get_ddt_dsize(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, spa_get_last_scrubbed_txg(spa), src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. 
*/ if (pool->dp_free_dir != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t *nv) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t *za; dsl_pool_t *dp; int err = 0; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); za = zap_attribute_alloc(); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. 
*/ spa_prop_get_config(spa, nv); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za->za_name)) == ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name)) continue; switch (za->za_integer_length) { case 8: /* integer property */ if (za->za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za->za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za->za_first_integer; } spa_prop_add_list(nv, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } if (prop != ZPOOL_PROP_INVAL) { spa_prop_add_list(nv, prop, strval, 0, src); } else { src = ZPROP_SRC_LOCAL; spa_prop_add_user(nv, za->za_name, strval, src); } kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); zap_attribute_free(za); if (err && err != ENOENT) return (err); return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. 
*/
/*
 * Each pair in `props` is checked according to its property; iteration stops
 * at the first pair that fails and that error is returned.  Any dedupditto
 * entry is removed from `props` unconditionally.  When a bootfs pair was seen
 * and no error occurred, its string entry is replaced by a uint64 entry
 * holding the object number recorded during validation.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		const char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			/*
			 * Sanitize the input.  Only user properties and
			 * feature names are accepted here.
			 */
			if (zfs_prop_user(propname)) {
				if (strlen(propname) >= ZAP_MAXNAMELEN) {
					error = SET_ERROR(ENAMETOOLONG);
					break;
				}

				if (strlen(fnvpair_value_string(elem)) >=
				    ZAP_MAXVALUELEN) {
					error = SET_ERROR(E2BIG);
					break;
				}
			} else if (zpool_prop_feature(propname)) {
				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (nvpair_value_uint64(elem, &intval) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/* Zero is the only value accepted here. */
				if (intval != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/*
				 * The feature name is the text after '@'.
				 * NOTE(review): assumes zpool_prop_feature()
				 * guarantees an '@' is present -- confirm.
				 */
				fname = strchr(propname, '@') + 1;
				if (zfeature_lookup_name(fname, NULL) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/*
				 * A version pair processed after this one is
				 * rejected (see ZPOOL_PROP_VERSION below).
				 */
				has_feature = B_TRUE;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
			break;

		case ZPOOL_PROP_VERSION:
			/*
			 * The version may not go below the current one, may
			 * not exceed SPA_VERSION_BEFORE_FEATURES, and may not
			 * follow a feature pair in the same list.
			 */
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
			/* Any uint64 value is accepted. */
			error = nvpair_value_uint64(elem, &intval);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
		case ZPOOL_PROP_AUTOTRIM:
			/* These only accept 0 or 1. */
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_MULTIHOST:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);

			/*
			 * A zero hostid is refused for either valid value; a
			 * non-zero one is stored in the spa.
			 */
			if (!error) {
				uint32_t hostid = zone_get_hostid(NULL);
				if (hostid)
					spa->spa_hostid = hostid;
				else
					error = SET_ERROR(ENOTSUP);
			}

			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/* The string entry is swapped for objnum after the loop. */
			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;

				/* An empty name selects the default value. */
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				error = dmu_objset_hold(strval, FTAG, &os);
				if (error != 0)
					break;

				/* Must be ZPL. */
				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * The empty string and "none" are accepted as-is.
			 * Anything else must start with '/' and its final
			 * component may not be empty, "." or "..".
			 */
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			/*
			 * The break below only leaves this scan loop; the
			 * length check after it still runs and replaces the
			 * EINVAL with E2BIG when the comment is also too long.
			 */
			for (check = strval; *check != '\0'; check++) {
				if (!isprint(*check)) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		default:
			break;
		}

		/* Stop at the first pair that failed validation. */
		if (error)
			break;
	}

	(void) nvlist_remove_all(props,
	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));

	/* Replace the bootfs dataset name with its object number. */
	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

/*
 * If `nvp` carries a cachefile property, allocate a config dirent for it and
 * insert it at the head of spa->spa_config_list: an empty string selects
 * spa_config_path, "none" stores a NULL path, and any other value is
 * duplicated as given.  An asynchronous config update is requested when
 * `need_sync` is set.  Does nothing when the property is absent.
 */
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	const char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

/*
 * Validate `nvp` and apply it.  cachefile, altroot and readonly pairs are
 * skipped here.  A version change (explicit, or implied by a feature pair) is
 * committed right away through a spa_sync_version sync task unless the pool
 * is already at that version.  The scan stops at the first user property or
 * other native property; if anything needs syncing, the whole list is handed
 * to a spa_sync_props sync task.  Returns 0 or the first error encountered.
 */
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_INVAL &&
		    zfs_prop_user(nvpair_name(elem))) {
			need_sync = B_TRUE;
			break;
		}

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver = 0;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 * Nothing is done when the pool has no properties object.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * Check callback paired with spa_change_guid_sync().  Fails with a checkpoint
 * error while the pool-checkpoint feature is active, and with ENXIO unless
 * the root vdev state is VDEV_STATE_HEALTHY.  `arg` points at the new GUID
 * and is only used by the assertion.
 */
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid __maybe_unused = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		int error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (SET_ERROR(error));
	}

	/* Sample the root vdev state under the SCL_STATE config lock. */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

/*
 * Sync callback paired with spa_change_guid_check(): installs the GUID that
 * `arg` points at as the root vdev GUID, adjusts the guid sum by the
 * difference, marks the root vdev config dirty and records the change in the
 * pool history.
 */
static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 *
 * The GUID of the pool will be changed to the value pointed to by guidp.
 * The GUID may not be set to the reserved value of 0.
 * The new GUID will be generated if guidp is NULL.
*/ int spa_change_guid(spa_t *spa, const uint64_t *guidp) { uint64_t guid; int error; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); if (guidp != NULL) { guid = *guidp; if (guid == 0) { error = SET_ERROR(EINVAL); goto out; } if (spa_guid_exists(guid, 0)) { error = SET_ERROR(EEXIST); goto out; } } else { guid = spa_generate_guid(NULL); } error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { /* * Clear the kobj flag from all the vdevs to allow * vdev_cache_process_kobj_evt() to post events to all the * vdevs since GUID is updated. */ vdev_clear_kobj_evt(spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } out: mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. 
*/
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	/* Copy the current tree headers out to the caller ... */
	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));

	/* ... then re-create both in-core trees. */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create the taskqs for zio type `t` and queue `q` as described by
 * zio_taskqs[t][q].  The mode decides how many taskqs (`count`) are created
 * and which thread `value` each one is given; SYNC and SCALE modes also set
 * TASKQ_THREADS_CPU_PCT.  NULL mode records an empty slot and creates nothing.
 */
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	uint_t cpus, flags = TASKQ_DYNAMIC;

	switch (mode) {
	case ZTI_MODE_FIXED:
		/* count and value come straight from the table entry. */
		ASSERT3U(value, >, 0);
		break;

	case ZTI_MODE_SYNC:

		/*
		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
		 * not to exceed the number of spa allocators, and align to it.
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		count = MIN(count, spa->spa_alloc_count);
		while (spa->spa_alloc_count % count != 0 &&
		    spa->spa_alloc_count < count * 2)
			count--;

		/*
		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
		 * single taskq may have more threads than 100% of online cpus.
		 */
		value = (zio_taskq_batch_pct + count / 2) / count;
		value = MIN(value, 100);
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	case ZTI_MODE_SCALE:
		flags |= TASKQ_THREADS_CPU_PCT;
		/*
		 * We want more taskqs to reduce lock contention, but we want
		 * less for better request ordering and CPU utilization.
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		if (zio_taskq_batch_tpq > 0) {
			/* Explicit threads-per-taskq: round to nearest. */
			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
			    zio_taskq_batch_tpq);
		} else {
			/*
			 * Prefer 6 threads per taskq, but no more taskqs
			 * than threads in them on large systems. For 80%:
			 *
			 *                 taskq   taskq   total
			 * cpus    taskqs  percent threads threads
			 * ------- ------- ------- ------- -------
			 * 1       1       80%     1       1
			 * 2       1       80%     1       1
			 * 4       1       80%     3       3
			 * 8       2       40%     3       6
			 * 16      3       27%     4       12
			 * 32      5       16%     5       25
			 * 64      7       11%     7       49
			 * 128     10      8%      10      100
			 * 256     14      6%      15      210
			 */
			count = 1 + cpus / 6;
			while (count * count > cpus)
				count--;
		}
		/* Limit each taskq within 100% to not trigger assertion. */
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		value = (zio_taskq_batch_pct + count / 2) / count;
		break;

	case ZTI_MODE_NULL:
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_taskqs_init()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	ASSERT3U(count, >, 0);
	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;
		char name[32];

		/* Only number the taskq name when there is more than one. */
		if (count > 1)
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		else
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);

#ifdef HAVE_SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			(void) zio_taskq_basedc;
			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly less important
			 * priority than the other taskqs.
			 *
			 * Under Linux and FreeBSD this means incrementing
			 * the priority value as opposed to platforms like
			 * illumos where it should be decremented.
			 *
			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
			 * are equal then a difference between them is
			 * insignificant.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
#if defined(__linux__)
				pri++;
#elif defined(__FreeBSD__)
				pri += 4;
#else
#error "unknown OS"
#endif
			}
			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef HAVE_SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

/*
 * Destroy every taskq recorded in spa->spa_zio_taskq[t][q] and free the
 * pointer array.  A slot whose array is NULL is left untouched.
 */
static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT3U(tqs->stqs_count, ==, 0);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

#ifdef _KERNEL
/*
 * The READ and WRITE rows of zio_taskqs are configurable at module load time
 * by setting zio_taskq_read or zio_taskq_write.
 *
 * Example (the defaults for READ and WRITE)
 *   zio_taskq_read='fixed,1,8 null scale null'
 *   zio_taskq_write='sync null scale null'
 *
 * Each sets the entire row at a time.
 *
 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
 * of threads per taskq.
 *
 * 'null' can only be set on the high-priority queues (queue selection for
 * high-priority queues will fall back to the regular queue if the high-pri
 * is NULL).
 */
/* Mode names, indexed by the zti_mode value they stand for. */
static const char *const modes[ZTI_NMODES] = {
	"fixed", "scale", "sync", "null"
};

/*
 * Parse the incoming config string.  Modifies cfg.
 *
 * Returns EINVAL unless exactly ZIO_TASKQ_TYPES valid entries are parsed and
 * nothing but whitespace follows them; only then is row `t` of zio_taskqs
 * overwritten.  Returns 0 on success.
 */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
	int err = 0;

	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};

	char *next = cfg, *tok, *c;

	/*
	 * Parse out each element from the string and fill `row`. The entire
	 * row has to be set at once, so any errors are flagged by just
	 * breaking out of this loop early.
	 */
	uint_t q;
	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
		/* `next` is the start of the config */
		if (next == NULL)
			break;

		/* Eat up leading space */
		while (isspace(*next))
			next++;
		if (*next == '\0')
			break;

		/* Mode ends at space or end of string */
		tok = next;
		next = strchr(tok, ' ');
		if (next != NULL) *next++ = '\0';

		/* Parameters start after a comma */
		c = strchr(tok, ',');
		if (c != NULL) *c++ = '\0';

		/* Match mode string */
		uint_t mode;
		for (mode = 0; mode < ZTI_NMODES; mode++)
			if (strcmp(tok, modes[mode]) == 0)
				break;
		if (mode == ZTI_NMODES)
			break;

		/* Invalid canary */
		row[q].zti_mode = ZTI_NMODES;

		/* Per-mode setup */
		switch (mode) {

		/*
		 * FIXED is parameterised: number of queues, and number of
		 * threads per queue.
		 */
		case ZTI_MODE_FIXED: {
			/* No parameters? */
			if (c == NULL || *c == '\0')
				break;

			/* Find next parameter */
			tok = c;
			c = strchr(tok, ',');
			if (c == NULL)
				break;

			/* Take digits and convert */
			unsigned long long nq;
			if (!(isdigit(*tok)))
				break;
			err = ddi_strtoull(tok, &tok, 10, &nq);
			/* Must succeed and also end at the next param sep */
			if (err != 0 || tok != c)
				break;

			/* Move past the comma */
			tok++;
			/* Need another number */
			if (!(isdigit(*tok)))
				break;
			/* Remember start to make sure we moved */
			c = tok;

			/* Take digits */
			unsigned long long ntpq;
			err = ddi_strtoull(tok, &tok, 10, &ntpq);
			/* Must succeed, and moved forward */
			if (err != 0 || tok == c || *tok != '\0')
				break;

			/*
			 * sanity; zero queues/threads make no sense, and
			 * 16K is almost certainly more than anyone will ever
			 * need and avoids silly numbers like UINT32_MAX
			 */
			if (nq == 0 || nq >= 16384 ||
			    ntpq == 0 || ntpq >= 16384)
				break;

			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
			row[q] = zti;
			break;
		}

		case ZTI_MODE_SCALE: {
			const zio_taskq_info_t zti = ZTI_SCALE;
			row[q] = zti;
			break;
		}

		case ZTI_MODE_SYNC: {
			const zio_taskq_info_t zti = ZTI_SYNC;
			row[q] = zti;
			break;
		}

		case ZTI_MODE_NULL: {
			/*
			 * Can only null the high-priority queues; the general-
			 * purpose ones have to exist.
			 */
			if (q != ZIO_TASKQ_ISSUE_HIGH &&
			    q != ZIO_TASKQ_INTERRUPT_HIGH)
				break;

			const zio_taskq_info_t zti = ZTI_NULL;
			row[q] = zti;
			break;
		}

		default:
			break;
		}

		/* Ensure we set a mode */
		if (row[q].zti_mode == ZTI_NMODES)
			break;
	}

	/* Didn't get a full row, fail */
	if (q < ZIO_TASKQ_TYPES)
		return (SET_ERROR(EINVAL));

	/* Eat trailing space */
	if (next != NULL)
		while (isspace(*next))
			next++;

	/* If there's anything left over then fail */
	if (next != NULL && *next != '\0')
		return (SET_ERROR(EINVAL));

	/* Success! Copy it into the real config */
	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
		zio_taskqs[t][q] = row[q];

	return (0);
}

/*
 * Format row `t` of zio_taskqs into `buf` as space-separated mode names, with
 * fixed entries written as "fixed,count,value", optionally followed by a
 * newline.  Returns the number of characters written, not counting the
 * terminating NUL.
 *
 * NOTE(review): the output is produced with sprintf(), so the caller must
 * supply a buffer large enough for the whole row.
 */
static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
	int pos = 0;

	/* Build parameter string from live config */
	const char *sep = "";
	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
		if (zti->zti_mode == ZTI_MODE_FIXED)
			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
			    modes[zti->zti_mode], zti->zti_count,
			    zti->zti_value);
		else
			pos += sprintf(&buf[pos], "%s%s", sep,
			    modes[zti->zti_mode]);
		sep = " ";
	}

	if (add_newline)
		buf[pos++] = '\n';
	buf[pos] = '\0';

	return (pos);
}

#ifdef __linux__
/*
 * Setter for the READ row.  The value is parsed from a private copy because
 * spa_taskq_param_set() modifies its input.  The error is returned negated.
 */
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
	char *cfg = kmem_strdup(val);
	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
	kmem_free(cfg, strlen(val)+1);
	return (-err);
}

/* Getter for the READ row; the output ends with a newline. */
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}

/*
 * Setter for the WRITE row.  The value is parsed from a private copy because
 * spa_taskq_param_set() modifies its input.  The error is returned negated.
 */
static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
	char *cfg = kmem_strdup(val);
	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
	kmem_free(cfg, strlen(val)+1);
	return (-err);
}

/* Getter for the WRITE row; the output ends with a newline. */
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
}
#else
/*
 * On FreeBSD load-time parameters can be set up before malloc() is available,
 * so we have to do all the parsing work on the
stack. */
/* Size of the on-stack buffer used by the two handlers below. */
#define	SPA_TASKQ_PARAM_MAX	(128)

/*
 * Handler for the READ row: the current row is formatted into a stack buffer
 * and passed to sysctl_handle_string().  When that succeeds and a new value
 * was supplied (req->newptr is set), the buffer is parsed as the new row.
 */
static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
	char buf[SPA_TASKQ_PARAM_MAX];
	int err;

	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
	if (err || req->newptr == NULL)
		return (err);
	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}

/*
 * Handler for the WRITE row: the current row is formatted into a stack buffer
 * and passed to sysctl_handle_string().  When that succeeds and a new value
 * was supplied (req->newptr is set), the buffer is parsed as the new row.
 */
static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
	char buf[SPA_TASKQ_PARAM_MAX];
	int err;

	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
	if (err || req->newptr == NULL)
		return (err);
	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself.
 *
 * `cutinline` requests dispatch with TQ_FRONT instead of no flags.
 */
void
spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, zio_t *zio, boolean_t cutinline)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio);
	ASSERT(taskq_empty_ent(&zio->io_tqent));

	/*
	 * Pick the taskq: the only one if there is just one; by allocator
	 * index for write-issue zios that have an allocator; otherwise by
	 * the current gethrtime() value modulo the taskq count.
	 */
	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
	    ZIO_HAS_ALLOCATOR(zio)) {
		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
	} else {
		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0,
	    &zio->io_tqent);
}

/* Initialise the taskqs for every (zio type, taskq type) pair of the pool. */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
/*
 * Body of the per-pool process; `arg` is the spa_t.  It names the process
 * "zpool-<name>", records itself in spa_proc/spa_did, creates the zio taskqs,
 * moves the proc state from CREATED to ACTIVE and then sleeps on spa_proc_cv
 * until the state leaves ACTIVE.  On wake-up it marks the state GONE, resets
 * spa_proc to &p0 and exits.
 */
static void
spa_thread(void *arg)
{
	psetid_t zio_taskq_psrset_bind = PS_NONE;
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

#ifdef HAVE_SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	/* Announce that the process is up, then wait to be deactivated. */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

extern metaslab_ops_t *metaslab_allocator(spa_t *spa);

/*
 * Activate an
uninitialized pool. */
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
	metaslab_ops_t *msp = metaslab_allocator(spa);
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_mode = mode;
	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;

-	spa->spa_normal_class = metaslab_class_create(spa, msp, B_FALSE);
-	spa->spa_log_class = metaslab_class_create(spa, msp, B_TRUE);
-	spa->spa_embedded_log_class = metaslab_class_create(spa, msp, B_TRUE);
-	spa->spa_special_class = metaslab_class_create(spa, msp, B_FALSE);
-	spa->spa_dedup_class = metaslab_class_create(spa, msp, B_FALSE);
+	spa->spa_normal_class = metaslab_class_create(spa, "normal",
+	    msp, B_FALSE);
+	spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE);
+	spa->spa_embedded_log_class = metaslab_class_create(spa,
+	    "embedded_log", msp, B_TRUE);
+	spa->spa_special_class = metaslab_class_create(spa, "special",
+	    msp, B_FALSE);
+	spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
+	    msp, B_FALSE);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef HAVE_SPA_THREAD
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* Wait for spa_thread() to leave the CREATED state. */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif /* HAVE_SPA_THREAD */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/* One root zio per txg slot; spa_deactivate() waits on each one. */
	for (size_t i = 0; i < TXG_SIZE; i++) {
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_healed,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));

	spa_activate_os(spa);

	spa_keystore_init(&spa->spa_keystore);

	/*
	 * This taskq is used to perform zvol-minor-related tasks
	 * asynchronously. This has several advantages, including easy
	 * resolution of various deadlocks.
	 *
	 * The taskq must be single threaded to ensure tasks are always
	 * processed in the order in which they were dispatched.
	 *
	 * A taskq per pool allows one to keep the pools independent.
	 * This way if one pool is suspended, it will not impact another.
	 *
	 * The preferred location to dispatch a zvol minor task is a sync
	 * task. In this context, there is easy access to the spa_t and minimal
	 * error handling is required because the sync task must succeed.
	 */
	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
	    1, INT_MAX, 0);

	/*
	 * The taskq to preload metaslabs.
	 */
	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * Taskq dedicated to prefetcher threads: this is used to prevent the
	 * pool traverse code from monopolizing the global (and limited)
	 * system_taskq by inappropriately scheduling long running tasks on it.
	 */
	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * The taskq to upgrade datasets in this pool. Currently used by
	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
	 */
	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}

/*
 * Opposite of spa_activate().
 *
 * Tears down the per-pool taskqs, lists, root zios, metaslab classes, error
 * trees and keystore, returns the pool to POOL_STATE_UNINITIALIZED, and, if
 * a pool process exists, asks it to exit and waits for it.
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	if (spa->spa_zvol_taskq) {
		taskq_destroy(spa->spa_zvol_taskq);
		spa->spa_zvol_taskq = NULL;
	}

	if (spa->spa_metaslab_taskq) {
		taskq_destroy(spa->spa_metaslab_taskq);
		spa->spa_metaslab_taskq = NULL;
	}

	if (spa->spa_prefetch_taskq) {
		taskq_destroy(spa->spa_prefetch_taskq);
		spa->spa_prefetch_taskq = NULL;
	}

	if (spa->spa_upgrade_taskq) {
		taskq_destroy(spa->spa_upgrade_taskq);
		spa->spa_upgrade_taskq = NULL;
	}

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	/* Wait for each per-txg root zio; every one must complete cleanly. */
	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	metaslab_class_destroy(spa->spa_embedded_log_class);
	spa->spa_embedded_log_class = NULL;

	metaslab_class_destroy(spa->spa_special_class);
	spa->spa_special_class = NULL;

	metaslab_class_destroy(spa->spa_dedup_class);
	spa->spa_dedup_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);
	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);
	avl_destroy(&spa->spa_errlist_healed);

	spa_keystore_fini(&spa->spa_keystore);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * If a pool process exists, move it to DEACTIVATE and wait until
	 * the state changes again; it is then expected to be GONE.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}

	spa_deactivate_os(spa);

}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
*/
/*
 * Parameters:
 *	spa	pool the vdevs belong to
 *	vdp	out: the vdev allocated for 'nv'; set to NULL if parsing of
 *		the children fails after the vdev itself was allocated
 *	nv	nvlist describing this vdev (and, recursively, its children)
 *	parent	parent vdev, or NULL
 *	id	child index of this vdev within its parent
 *	atype	allocation type, passed through to vdev_alloc()
 *
 * Returns 0 on success, the vdev_alloc() error if the vdev itself cannot be
 * allocated, EINVAL if the children array cannot be looked up for a reason
 * other than ENOENT, or the error from parsing a child.  On a child failure
 * the partially built vdev is released with vdev_free().
 */
int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	/* Leaf vdevs have no children to parse. */
	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	/* An interior vdev without a children array is accepted as-is. */
	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * The index is unsigned to match both 'children' and the 'id'
	 * parameter it is passed as, avoiding a signed/unsigned comparison.
	 */
	for (uint_t c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Decide whether spa_unload() should try to flush all metaslabs before the
 * pool goes away.  Returns B_TRUE only when the log spacemap feature is
 * active, the pool is writeable, syncing is still on, the pool state is
 * POOL_STATE_EXPORTED, and zfs_keep_log_spacemaps_at_export is not set.
 */
static boolean_t
spa_should_flush_logs_on_unload(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return (B_FALSE);

	if (!spa_writeable(spa))
		return (B_FALSE);

	if (!spa->spa_sync_on)
		return (B_FALSE);

	if (spa_state(spa) != POOL_STATE_EXPORTED)
		return (B_FALSE);

	if (zfs_keep_log_spacemaps_at_export)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Opens a transaction that will set the flag that will instruct
 * spa_sync to attempt to flush all the metaslabs for that txg.
*/
static void
spa_unload_log_sm_flush_all(spa_t *spa)
{
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND));

	/* Record the txg of this tx as the flush-all txg; it must be unset. */
	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);

	dmu_tx_commit(tx);
	/* Block until the recorded txg has been synced. */
	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
}

/*
 * Free the in-memory log spacemap bookkeeping: every node of
 * spa_sm_logs_by_txg and every entry of spa_log_summary (each is verified to
 * have a zero metaslab count before being freed), then zero the unflushed
 * statistics.
 */
static void
spa_unload_log_sm_metadata(spa_t *spa)
{
	void *cookie = NULL;
	spa_log_sm_t *sls;
	log_summary_entry_t *e;

	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
	    &cookie)) != NULL) {
		VERIFY0(sls->sls_mscount);
		kmem_free(sls, sizeof (spa_log_sm_t));
	}

	while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
		VERIFY0(e->lse_mscount);
		kmem_free(e, sizeof (log_summary_entry_t));
	}

	spa->spa_unflushed_stats.sus_nblocks = 0;
	spa->spa_unflushed_stats.sus_memused = 0;
	spa->spa_unflushed_stats.sus_blocklimit = 0;
}

/*
 * Destroy each auxiliary zthr that exists (condense, checkpoint discard,
 * livelist delete, livelist condense, raidz expand) and clear its pointer.
 */
static void
spa_destroy_aux_threads(spa_t *spa)
{
	if (spa->spa_condense_zthr != NULL) {
		zthr_destroy(spa->spa_condense_zthr);
		spa->spa_condense_zthr = NULL;
	}
	if (spa->spa_checkpoint_discard_zthr != NULL) {
		zthr_destroy(spa->spa_checkpoint_discard_zthr);
		spa->spa_checkpoint_discard_zthr = NULL;
	}
	if (spa->spa_livelist_delete_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_delete_zthr);
		spa->spa_livelist_delete_zthr = NULL;
	}
	if (spa->spa_livelist_condense_zthr != NULL) {
		zthr_destroy(spa->spa_livelist_condense_zthr);
		spa->spa_livelist_condense_zthr = NULL;
	}
	if (spa->spa_raidz_expand_zthr != NULL) {
		zthr_destroy(spa->spa_raidz_expand_zthr);
		spa->spa_raidz_expand_zthr = NULL;
	}
}

/*
 * Opposite of spa_load().
 *
 * Must be called with spa_namespace_lock held or from the pool's export
 * thread (see the first assertion).
 */
static void
spa_unload(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    spa->spa_export_thread == curthread);
	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);

	spa_import_progress_remove(spa_guid(spa));
	spa_load_note(spa, "UNLOADING");

	spa_wake_waiters(spa);

	/*
	 * If we have set the spa_final_txg, we have already performed the
	 * tasks below in spa_export_common().  We should not redo it here since
	 * we delay the final TXGs beyond what spa_final_txg is set at.
	 */
	if (spa->spa_final_txg == UINT64_MAX) {
		/*
		 * If the log space map feature is enabled and the pool is
		 * getting exported (but not destroyed), we want to spend some
		 * time flushing as many metaslabs as we can in an attempt to
		 * destroy log space maps and save import time.
		 */
		if (spa_should_flush_logs_on_unload(spa))
			spa_unload_log_sm_flush_all(spa);

		/*
		 * Stop async tasks.
		 */
		spa_async_suspend(spa);

		if (spa->spa_root_vdev) {
			vdev_t *root_vdev = spa->spa_root_vdev;
			vdev_initialize_stop_all(root_vdev,
			    VDEV_INITIALIZE_ACTIVE);
			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
			vdev_autotrim_stop_all(spa);
			vdev_rebuild_stop_all(spa);
			l2arc_spa_rebuild_stop(spa);
		}

		/* spa_final_txg is updated while holding all config locks. */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa->spa_final_txg = spa_last_synced_txg(spa) +
		    TXG_DEFER_SIZE + 1;
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * This ensures that there is no async metaslab prefetching
	 * while we attempt to unload the spa.
	 */
	taskq_wait(spa->spa_metaslab_taskq);

	if (spa->spa_mmp.mmp_thread)
		mmp_thread_stop(spa);

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_destroy_aux_threads(spa);

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Everything from here to the matching spa_config_exit() at the end
	 * of this function runs with all config locks held as writer, using
	 * 'spa' itself as the tag.
	 */
	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
*/
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);
	brt_unload(spa);
	spa_unload_log_sm_metadata(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/* Free the spare vdevs, their array, and the stashed spare config. */
	if (spa->spa_spares.sav_vdevs) {
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			vdev_free(spa->spa_spares.sav_vdevs[i]);
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* Same for the l2cache vdevs, clearing their stats first. */
	if (spa->spa_l2cache.sav_vdevs) {
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
		}
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}
	if (spa->spa_compatibility != NULL) {
		spa_strfree(spa->spa_compatibility);
		spa->spa_compatibility = NULL;
	}

	spa->spa_raidz_expand = NULL;
	spa->spa_checkpoint_txg = 0;

	spa_config_exit(spa, SCL_ALL, spa);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 *
 * The caller must hold all config locks (SCL_ALL) as writer.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

#ifndef _KERNEL
	/*
	 * zdb opens both the current state of the pool and the
	 * checkpointed state (if present), with a different spa_t.
	 *
	 * As spare vdevs are shared among open pools, we skip loading
	 * them when we load the checkpointed state of the pool.
	 */
	if (!spa_writeable(spa))
		return;
#endif

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	if (spa->spa_spares.sav_vdevs) {
		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];

			/*
			 * Undo the spare registration performed for the
			 * in-pool vdev by the loop below (spa_spare_add() /
			 * spa_spare_activate()).
			 */
			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
			    B_FALSE)) != NULL && tvd->vdev_isspare)
				spa_spare_remove(tvd);
			vdev_close(vd);
			vdev_free(vd);
		}

		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
	}

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares));

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.   For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		/* An aux vdev is its own top-level vdev. */
		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		/* A spare that fails to open stays in the array unregistered. */
		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
	    spa->spa_spares.sav_count);
	/* The temporary nvlists and their array are freed once added. */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache = NULL;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

#ifndef _KERNEL
	/*
	 * zdb opens both the current state of the pool and the
	 * checkpointed state (if present), with a different spa_t.
*
	 * As L2 caches are part of the ARC which is shared among open
	 * pools, we skip loading them when we load the checkpointed
	 * state of the pool.
	 */
	if (!spa_writeable(spa))
		return;
#endif

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * Detach the current array from 'sav'.  Entries that are reused
	 * below are moved into the new array; whatever is left in the old
	 * array is purged at 'out'.
	 */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	if (sav->sav_config == NULL) {
		nl2cache = 0;
		newvdevs = NULL;
		goto out;
	}

	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			/* A failed open also skips the TRIM request below. */
			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);

			/*
			 * Upon cache device addition to a pool or pool
			 * creation with a cache device or if the header
			 * of the device is invalid we issue an async
			 * TRIM command for the whole device which will
			 * execute if l2arc_trim_ahead > 0.
			 */
			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
		}
	}

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);

	/*
	 * From here on 'l2cache' is a temporary array owned by this function
	 * (when sav_count > 0); it is freed after the 'out' label.
	 * NOTE(review): when sav_count is 0, 'l2cache' still holds the
	 * pointer looked up above and is passed with a count of 0 - confirm
	 * fnvlist_add_nvlist_array() never dereferences it in that case.
	 */
	if (sav->sav_count > 0)
		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
		    KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    (const nvlist_t * const *)l2cache, sav->sav_count);

out:
	/*
	 * Purge vdevs that were dropped
	 */
	if (oldvdevs) {
		for (i = 0; i < oldnvdevs; i++) {
			uint64_t pool;

			vd = oldvdevs[i];
			if (vd != NULL) {
				ASSERT(vd->vdev_isl2cache);

				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
				    pool != 0ULL && l2arc_vdev_present(vd))
					l2arc_remove_vdev(vd);
				vdev_clear_stats(vd);
				vdev_free(vd);
			}
		}

		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
	}

	/* On the early 'goto out' path sav_count is 0, so nothing is freed. */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read the packed nvlist stored in MOS object 'obj' and unpack it into
 * '*value'.  The packed size is taken from the first uint64_t of the
 * object's bonus buffer.  '*value' is set to NULL up front; the return value
 * is 0 or the error from dmu_bonus_hold(), dmu_read() or nvlist_unpack().
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = vmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	/* The packed buffer is scratch space and is freed on every path. */
	vmem_free(packed, nvsize);

	return (error);
}

/*
 * Concrete top-level vdevs that are not missing and are not logs. At every
 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
*/ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); nv = fnvlist_alloc(); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). 
*/ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } /* * Passivate any log vdevs (note, does not apply to embedded log metaslabs). */ static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } return (slog_found); } /* * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
*/ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_activate(tvd->vdev_mg); } } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. 
*/ static uint_t spa_load_verify_shift = 4; static int spa_load_verify_metadata = B_TRUE; static int spa_load_verify_data = B_TRUE; static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zio_t *rio = arg; spa_load_error_t *sle = rio->io_private; (void) zilog, (void) dnp; /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); /* * Sanity check the block pointer in order to detect obvious damage * before using the contents in subsequent checks or in zio_read(). * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); if (!BP_IS_METADATA(bp) && (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp, (void) arg; if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; 
zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); /* * Verify data only if we are rewinding or error limit was set. * Otherwise nothing except dbgmsg care about it to waste time. */ sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || (policy.zlp_maxdata < UINT64_MAX); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 
spa->spa_load_txg_ts); fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); zap_cursor_init(&zc, 
os, zap_obj); err = zap_cursor_retrieve(&zc, za); zap_cursor_fini(&zc); if (err == 0) *llp = za->za_first_integer; zap_attribute_free(za); return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. 
*/
static void
spa_livelist_delete_cb(void *arg, zthr_t *z)
{
	spa_t *spa = arg;
	uint64_t ll_obj = 0, count;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zap_obj = spa->spa_livelists_to_delete;
	/*
	 * Determine the next livelist to delete. This function should only
	 * be called if there is at least one deleted clone.
	 */
	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
	VERIFY0(zap_count(mos, ll_obj, &count));
	if (count > 0) {
		/* Sublists remain: process and delete the first one. */
		dsl_deadlist_t *ll;
		dsl_deadlist_entry_t *dle;
		bplist_t to_free;
		ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
		VERIFY0(dsl_deadlist_open(ll, mos, ll_obj));
		dle = dsl_deadlist_first(ll);
		ASSERT3P(dle, !=, NULL);
		bplist_create(&to_free);
		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
		    z, NULL);
		if (err == 0) {
			sublist_delete_arg_t sync_arg = {
			    .spa = spa,
			    .ll = ll,
			    .key = dle->dle_mintxg,
			    .to_free = &to_free
			};
			zfs_dbgmsg("deleting sublist (id %llu) from"
			    " livelist %llu, %lld remaining",
			    (u_longlong_t)dle->dle_bpobj.bpo_object,
			    (u_longlong_t)ll_obj, (longlong_t)count - 1);
			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
			    sublist_delete_sync, &sync_arg, 0,
			    ZFS_SPACE_CHECK_DESTROY));
		} else {
			/* The only tolerated failure is an interruption. */
			VERIFY3U(err, ==, EINTR);
		}
		/* Local resources are released on both paths. */
		bplist_clear(&to_free);
		bplist_destroy(&to_free);
		dsl_deadlist_close(ll);
		kmem_free(ll, sizeof (dsl_deadlist_t));
	} else {
		/* No sublists left: delete the livelist object itself. */
		livelist_delete_arg_t sync_arg = {
		    .spa = spa,
		    .ll_obj = ll_obj,
		    .zap_obj = zap_obj
		};
		zfs_dbgmsg("deletion of livelist %llu completed",
		    (u_longlong_t)ll_obj);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
	}
}

/*
 * Create the "z_livelist_destroy" zthr, which runs spa_livelist_delete_cb()
 * whenever spa_livelist_delete_cb_check() returns true.  The zthr must not
 * already exist.
 */
static void
spa_start_livelist_destroy_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
	spa->spa_livelist_delete_zthr =
	    zthr_create("z_livelist_destroy",
	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
	    minclsyspri);
}

/* Destination lists used by livelist_track_new_cb(). */
typedef struct livelist_new_arg {
	bplist_t *allocs;
	bplist_t *frees;
} livelist_new_arg_t;

/*
 * Iteration callback that sorts a block pointer into the 'frees' or the
 * 'allocs' list of the livelist_new_arg_t passed as 'arg', depending on
 * 'bp_freed'.  Must be called without a tx.
 */
static int
livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	ASSERT(tx == NULL);
	livelist_new_arg_t *lna = arg;
	if (bp_freed) {
		bplist_append(lna->frees, bp);
	} else {
		bplist_append(lna->allocs, bp);
		zfs_livelist_condense_new_alloc++;
	}
	return (0);
}

/*
 * State handed from the condense zthr to its sync task: the blocks to keep
 * and the sizes of the two bpobjs as seen when they were processed.
 */
typedef struct livelist_condense_arg {
	spa_t *spa;
	bplist_t to_keep;
	uint64_t first_size;
	uint64_t next_size;
} livelist_condense_arg_t;

/*
 * Sync task that completes a livelist condense started by
 * spa_livelist_condense_cb().  It owns and frees 'arg'.
 */
static void
spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
{
	livelist_condense_arg_t *lca = arg;
	spa_t *spa = lca->spa;
	bplist_t new_frees;
	dsl_dataset_t *ds = spa->spa_to_condense.ds;

	/* Have we been cancelled? */
	if (spa->spa_to_condense.cancelled) {
		zfs_livelist_condense_sync_cancel++;
		goto out;
	}

	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;

	/*
	 * It's possible that the livelist was changed while the zthr was
	 * running. Therefore, we need to check for new blkptrs in the two
	 * entries being condensed and continue to track them in the livelist.
	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
	 * we need to sort them into two different bplists.
*/ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, (u_longlong_t)cur_next_size, (u_longlong_t)first->dle_bpobj.bpo_object, (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; 
livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. 
*/ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa, minclsyspri); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa, minclsyspri); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. 
* * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. 
*/ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. 
*/ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && vd->vdev_root_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_root_zap)); } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #else #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. 
If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. 
*/ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_FAIL_INT(ub), (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%llu leaves=%u", (u_longlong_t)import_delay, (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)multihost_interval, (u_longlong_t)import_intervals); } return (import_delay); } /* * Remote host activity check. 
*
 * Repeatedly reloads the best uberblock from disk for the duration computed
 * by spa_activity_check_duration() (plus a random margin) and reports
 * activity if its txg, timestamp or MMP sequence changes.  `importing`
 * selects whether import progress notes are published while waiting.
 *
 * error results:
 *          0 - no activity detected
 *  EREMOTEIO - remote activity detected
 *      EINTR - user canceled the operation
 */
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
    boolean_t importing)
{
	/* Baseline values; any later difference counts as remote activity. */
	uint64_t txg = ub->ub_txg;
	uint64_t timestamp = ub->ub_timestamp;
	uint64_t mmp_config = ub->ub_mmp_config;
	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
	uint64_t import_delay;
	hrtime_t import_expire, now;
	nvlist_t *mmp_label = NULL;
	vdev_t *rvd = spa->spa_root_vdev;
	kcondvar_t cv;
	kmutex_t mtx;
	int error = 0;

	/* Local cv/mutex pair, used only for the timed wait in the loop. */
	cv_init(&cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mtx);

	/*
	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
	 * during the earlier tryimport.  If the txg recorded there is 0 then
	 * the pool is known to be active on another host.
	 *
	 * Otherwise, the pool might be in use on another host.  Check for
	 * changes in the uberblocks on disk if necessary.
	 */
	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
		    ZPOOL_CONFIG_LOAD_INFO);

		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
			/* Load the label so the host info can be reported. */
			vdev_uberblock_load(rvd, ub, &mmp_label);
			error = SET_ERROR(EREMOTEIO);
			goto out;
		}
	}

	import_delay = spa_activity_check_duration(spa, ub);

	/* Add a small random factor in case of simultaneous imports (0-25%) */
	import_delay += import_delay * random_in_range(250) / 1000;

	import_expire = gethrtime() + import_delay;

	if (importing) {
		spa_import_progress_set_notes(spa, "Checking MMP activity, "
		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
	}

	int iterations = 0;
	while ((now = gethrtime()) < import_expire) {
		/* Refresh the progress note on every 30th iteration. */
		if (importing && iterations++ % 30 == 0) {
			spa_import_progress_set_notes(spa, "Checking MMP "
			    "activity, %llu ms remaining",
			    (u_longlong_t)NSEC2MSEC(import_expire - now));
		}

		if (importing) {
			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
			    NSEC2SEC(import_expire - gethrtime()));
		}

		vdev_uberblock_load(rvd, ub, &mmp_label);

		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
			zfs_dbgmsg("multihost activity detected "
			    "txg %llu ub_txg  %llu "
			    "timestamp %llu ub_timestamp  %llu "
			    "mmp_config %#llx ub_mmp_config %#llx",
			    (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
			    (u_longlong_t)timestamp,
			    (u_longlong_t)ub->ub_timestamp,
			    (u_longlong_t)mmp_config,
			    (u_longlong_t)ub->ub_mmp_config);

			/* mmp_label is kept here; it is consumed after "out". */
			error = SET_ERROR(EREMOTEIO);
			break;
		}

		if (mmp_label) {
			nvlist_free(mmp_label);
			mmp_label = NULL;
		}

		/*
		 * Wait up to hz ticks.  Any return value other than -1 is
		 * treated as an interruption of the check.
		 */
		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
		if (error != -1) {
			error = SET_ERROR(EINTR);
			break;
		}
		error = 0;
	}

out:
	mutex_exit(&mtx);
	mutex_destroy(&mtx);
	cv_destroy(&cv);

	/*
	 * If the pool is determined to be active store the status in the
	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
	 * available from configuration read from disk store them as well.
	 * This allows 'zpool import' to generate a more useful message.
	 *
	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
	 */
	if (error == EREMOTEIO) {
		const char *hostname = "<unknown>";
		uint64_t hostid = 0;

		if (mmp_label) {
			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
				hostname = fnvlist_lookup_string(mmp_label,
				    ZPOOL_CONFIG_HOSTNAME);
				fnvlist_add_string(spa->spa_load_info,
				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
			}

			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
				hostid = fnvlist_lookup_uint64(mmp_label,
				    ZPOOL_CONFIG_HOSTID);
				fnvlist_add_uint64(spa->spa_load_info,
				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
			}
		}

		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
		/* An MMP_TXG of 0 is what the early-out above looks for. */
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_TXG, 0);

		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
	}

	if (mmp_label)
		nvlist_free(mmp_label);

	return (error);
}

/*
 * Called from zfs_ioc_clear for a pool that was suspended
 * after failing mmp write checks.
*/ boolean_t spa_mmp_remote_host_activity(spa_t *spa) { ASSERT(spa_multihost(spa) && spa_suspended(spa)); nvlist_t *best_label; uberblock_t best_ub; /* * Locate the best uberblock on disk */ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); if (best_label) { /* * confirm that the best hostid matches our hostid */ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && spa_get_hostid(spa) != fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { nvlist_free(best_label); return (B_TRUE); } nvlist_free(best_label); } else { return (B_TRUE); } if (!MMP_VALID(&best_ub) || !MMP_FAIL_INT_VALID(&best_ub) || MMP_FAIL_INT(&best_ub) == 0) { return (B_TRUE); } if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { zfs_dbgmsg("txg mismatch detected during pool clear " "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", (u_longlong_t)spa->spa_uberblock.ub_txg, (u_longlong_t)best_ub.ub_txg, (u_longlong_t)spa->spa_uberblock.ub_timestamp, (u_longlong_t)best_ub.ub_timestamp); return (B_TRUE); } /* * Perform an activity check looking for any remote writer */ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, B_FALSE) != 0); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; const char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). 
" "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; const char *comment; const char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. 
*/
#ifdef _KERNEL
	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
#else
	/* Userland additionally permits a read-only checkpoint import. */
	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0) &&
	    !spa_importing_readonly_checkpoint(spa)) {
#endif
		spa_load_failed(spa, "a pool with guid %llu is already open",
		    (u_longlong_t)pool_guid);
		return (SET_ERROR(EEXIST));
	}

	spa->spa_config_guid = pool_guid;

	/* Start this load attempt with a fresh, empty load_info nvlist. */
	nvlist_free(spa->spa_load_info);
	spa->spa_load_info = fnvlist_alloc();

	/* Optional properties: copied only when present in the config. */
	ASSERT(spa->spa_comment == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	ASSERT(spa->spa_compatibility == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
	    &compatibility) == 0)
		spa->spa_compatibility = spa_strdup(compatibility);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
		spa->spa_config_splitting = fnvlist_dup(nvl);

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
		spa_load_failed(spa, "invalid config provided: '%s' missing",
		    ZPOOL_CONFIG_VDEV_TREE);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	/* One root zio per CPU (max_ncpus entries). */
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	    KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);
	}

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_load_failed(spa, "unable to parse config [error=%d]",
		    error);
		return (error);
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	return (0);
}

/*
 * Recursively open all vdevs in the vdev tree. This function is called twice:
 * first with the untrusted config, then with the trusted config.
 */
static int
spa_ld_open_vdevs(spa_t *spa)
{
	int error = 0;

	/*
	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
	 * missing/unopenable for the root vdev to be still considered openable.
	 */
	if (spa->spa_trust_config) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
	} else {
		spa->spa_missing_tvds_allowed = 0;
	}

	/* zfs_max_missing_tvds acts as a floor for every config source. */
	spa->spa_missing_tvds_allowed =
	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(spa->spa_root_vdev);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (spa->spa_missing_tvds != 0) {
		spa_load_note(spa, "vdev tree has %lld missing top-level "
		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
		if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
			/*
			 * Although theoretically we could allow users to open
			 * incomplete pools in RW mode, we'd need to add a lot
			 * of extra logic (e.g. adjust pool space to account
			 * for missing vdevs).
			 * This limitation also prevents users from accidentally
			 * opening the pool in RW mode during data recovery and
			 * damaging it further.
*/ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. 
*/
	if (ub->ub_txg == 0) {
		nvlist_free(label);
		spa_load_failed(spa, "no valid uberblock found");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
	}

	if (spa->spa_load_max_txg != UINT64_MAX) {
		(void) spa_import_progress_set_max_txg(spa_guid(spa),
		    (u_longlong_t)spa->spa_load_max_txg);
	}
	spa_load_note(spa, "using uberblock with txg=%llu",
	    (u_longlong_t)ub->ub_txg);
	if (ub->ub_raidz_reflow_info != 0) {
		spa_load_note(spa, "uberblock raidz_reflow_info: "
		    "state=%u offset=%llu",
		    (int)RRSS_GET_STATE(ub),
		    (u_longlong_t)RRSS_GET_OFFSET(ub));
	}


	/*
	 * For pools which have the multihost property on determine if the
	 * pool is truly inactive and can be safely imported.  Prevent
	 * hosts which don't have a hostid set from importing the pool.
	 */
	activity_check = spa_activity_check_required(spa, ub, label,
	    spa->spa_config);
	if (activity_check) {
		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
		    spa_get_hostid(spa) == 0) {
			nvlist_free(label);
			fnvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
		}

		int error =
		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
		if (error) {
			nvlist_free(label);
			return (error);
		}

		/* Record the clean result in load_info for later consumers. */
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
		fnvlist_add_uint16(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_SEQ,
		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
	}

	/*
	 * If the pool has an unsupported version we can't open it.
	 */
	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
		nvlist_free(label);
		spa_load_failed(spa, "version %llu is not supported",
		    (u_longlong_t)ub->ub_version);
		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
	}

	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *features;

		/*
		 * If we weren't able to find what's necessary for reading the
		 * MOS in the label, return failure.
		 */
		if (label == NULL) {
			spa_load_failed(spa, "label config unavailable");
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    ENXIO));
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) != 0) {
			nvlist_free(label);
			spa_load_failed(spa, "invalid label: '%s' missing",
			    ZPOOL_CONFIG_FEATURES_FOR_READ);
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    ENXIO));
		}

		/*
		 * Update our in-core representation with the definitive values
		 * from the label.
		 */
		nvlist_free(spa->spa_label_features);
		spa->spa_label_features = fnvlist_dup(features);
	}

	/* The label is not used past this point. */
	nvlist_free(label);

	/*
	 * Look through entries in the label nvlist's features_for_read. If
	 * there is a feature listed there which we don't understand then we
	 * cannot open a pool.
	 */
	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *unsup_feat;

		unsup_feat = fnvlist_alloc();

		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
		    NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
			if (!zfeature_is_supported(nvpair_name(nvp))) {
				fnvlist_add_string(unsup_feat,
				    nvpair_name(nvp), "");
			}
		}

		if (!nvlist_empty(unsup_feat)) {
			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
			nvlist_free(unsup_feat);
			spa_load_failed(spa, "some features are unsupported");
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}

		nvlist_free(unsup_feat);
	}

	/* Resolve a partly-completed split, then drop the saved split list. */
	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_try_repair(spa, spa->spa_config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa_ld_select_uberblock_done(spa, ub);

	return (0);
}

/*
 * Initialize the DSL pool at spa_first_txg and cache its meta objset in
 * the spa.  Any dsl_pool_init() failure is reported as EIO with
 * VDEV_AUX_CORRUPT_DATA.
 */
static int
spa_ld_open_rootbp(spa_t *spa)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error != 0) {
		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
		    "[error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	return (0);
}

/*
 * Replace the vdev tree built from the provided config with one built from
 * the config stored in the MOS, then reopen and revalidate it.
 */
static int
spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
    boolean_t reloading)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv, *mos_config, *policy;
	int error = 0, copy_error;
	uint64_t healthy_tvds, healthy_tvds_mos;
	uint64_t mos_config_txg;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
	    != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * If we're assembling a pool from a split, the config provided is
	 * already trusted so there is nothing to do.
	 */
	if (type == SPA_IMPORT_ASSEMBLE)
		return (0);

	/* Baseline taken before the tree is swapped; compared further down. */
	healthy_tvds = spa_healthy_core_tvds(spa);

	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
	    != 0) {
		spa_load_failed(spa, "unable to retrieve MOS config");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * If we are doing an open, pool owner wasn't verified yet, thus do
	 * the verification here.
	 */
	if (spa->spa_load_state == SPA_LOAD_OPEN) {
		error = spa_verify_host(spa, mos_config);
		if (error != 0) {
			nvlist_free(mos_config);
			return (error);
		}
	}

	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Build a new vdev tree from the trusted config
	 */
	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
	if (error != 0) {
		nvlist_free(mos_config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
		    error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	}

	/*
	 * Vdev paths in the MOS may be obsolete.
If the untrusted config was
	 * obtained by scanning /dev/dsk, then it will have the right vdev
	 * paths. We update the trusted MOS config with this information.
	 * We first try to copy the paths with vdev_copy_path_strict, which
	 * succeeds only when both configs have exactly the same vdev tree.
	 * If that fails, we fall back to a more flexible method that has a
	 * best effort policy.
	 */
	copy_error = vdev_copy_path_strict(rvd, mrvd);
	if (copy_error != 0 || spa_load_print_vdev_tree) {
		spa_load_note(spa, "provided vdev tree:");
		vdev_dbgmsg_print_tree(rvd, 2);
		spa_load_note(spa, "MOS vdev tree:");
		vdev_dbgmsg_print_tree(mrvd, 2);
	}
	if (copy_error != 0) {
		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
		    "back to vdev_copy_path_relaxed");
		vdev_copy_path_relaxed(rvd, mrvd);
	}

	/* Swap in the MOS-derived tree; the old tree is closed and freed. */
	vdev_close(rvd);
	vdev_free(rvd);
	spa->spa_root_vdev = mrvd;
	rvd = mrvd;
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * If 'zpool import' used a cached config, then the on-disk hostid and
	 * hostname may be different to the cached config in ways that should
	 * prevent import.  Userspace can't discover this without a scan, but
	 * we know, so we add these values to LOAD_INFO so the caller can know
	 * the difference.
	 *
	 * Note that we have to do this before the config is regenerated,
	 * because the new config will have the hostid and hostname for this
	 * host, in readiness for import.
	 */
	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));

	/*
	 * We will use spa_config if we decide to reload the spa or if spa_load
	 * fails and we rewind. We must thus regenerate the config using the
	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
	 * pass settings on how to load the pool and is not stored in the MOS.
	 * We copy it over to our new, trusted config.
	 */
	mos_config_txg = fnvlist_lookup_uint64(mos_config,
	    ZPOOL_CONFIG_POOL_TXG);
	nvlist_free(mos_config);
	/* From here on mos_config refers to the regenerated config. */
	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
	    &policy) == 0)
		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
	spa_config_set(spa, mos_config);
	spa->spa_config_source = SPA_CONFIG_SRC_MOS;

	/*
	 * Now that we got the config from the MOS, we should be more strict
	 * in checking blkptrs and can make assumptions about the consistency
	 * of the vdev tree. spa_trust_config must be set to true before opening
	 * vdevs in order for them to be writeable.
	 */
	spa->spa_trust_config = B_TRUE;

	/*
	 * Open and validate the new vdev tree
	 */
	error = spa_ld_open_vdevs(spa);
	if (error != 0)
		return (error);

	error = spa_ld_validate_vdevs(spa);
	if (error != 0)
		return (error);

	if (copy_error != 0 || spa_load_print_vdev_tree) {
		spa_load_note(spa, "final vdev tree:");
		vdev_dbgmsg_print_tree(rvd, 2);
	}

	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
		/*
		 * Sanity check to make sure that we are indeed loading the
		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
		 * in the config provided and they happened to be the only ones
		 * to have the latest uberblock, we could involuntarily perform
		 * an extreme rewind.
		 */
		healthy_tvds_mos = spa_healthy_core_tvds(spa);
		if (healthy_tvds_mos - healthy_tvds >=
		    SPA_SYNC_MIN_VDEVS) {
			spa_load_note(spa, "config provided misses too many "
			    "top-level vdevs compared to MOS (%lld vs %lld). ",
			    (u_longlong_t)healthy_tvds,
			    (u_longlong_t)healthy_tvds_mos);
			spa_load_note(spa, "vdev tree:");
			vdev_dbgmsg_print_tree(rvd, 2);
			/* A reload that still disagrees is a hard failure. */
			if (reloading) {
				spa_load_failed(spa, "config was already "
				    "provided from MOS. Aborting.");
				return (spa_vdev_err(rvd,
				    VDEV_AUX_CORRUPT_DATA, EIO));
			}
			spa_load_note(spa, "spa must be reloaded using MOS "
			    "config");
			return (SET_ERROR(EAGAIN));
		}
	}

	error = spa_check_for_missing_logs(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
		    "guid sum (%llu != %llu)",
		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
		    (u_longlong_t)rvd->vdev_guid_sum);
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
		    ENXIO));
	}

	return (0);
}

static int
spa_ld_open_indirect_vdev_metadata(spa_t *spa)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * Everything that we read before spa_remove_init() must be stored
	 * on concreted vdevs.  Therefore we do this as early as possible.
	 */
	error = spa_remove_init(spa);
	if (error != 0) {
		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
		    error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Retrieve information needed to condense indirect vdev mappings.
*/
	error = spa_condense_init(spa);
	if (error != 0) {
		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
		    error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	}

	return (0);
}

/*
 * Locate the feature objects in the MOS, check the pool's features against
 * what this implementation supports, publish the results in spa_load_info,
 * and fill spa_feat_refcount_cache from disk.
 *
 * *missing_feat_writep is set to B_TRUE (never cleared here) when a feature
 * required for writing is unsupported.  Returns 0 on success or the result
 * of spa_vdev_err() on failure.
 */
static int
spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
		boolean_t missing_feat_read = B_FALSE;
		nvlist_t *unsup_feat, *enabled_feat;

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		/* Both lists are freed below, before any later return. */
		enabled_feat = fnvlist_alloc();
		unsup_feat = fnvlist_alloc();

		if (!spa_features_check(spa, B_FALSE,
		    unsup_feat, enabled_feat))
			missing_feat_read = B_TRUE;

		if (spa_writeable(spa) ||
		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
			if (!spa_features_check(spa, B_TRUE,
			    unsup_feat, enabled_feat)) {
				*missing_feat_writep = B_TRUE;
			}
		}

		fnvlist_add_nvlist(spa->spa_load_info,
		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

		if (!nvlist_empty(unsup_feat)) {
			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
		}

		fnvlist_free(enabled_feat);
		fnvlist_free(unsup_feat);

		if (!missing_feat_read) {
			fnvlist_add_boolean(spa->spa_load_info,
			    ZPOOL_CONFIG_CAN_RDONLY);
		}

		/*
		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
		 * twofold: to determine whether the pool is available for
		 * import in read-write mode and (if it is not) whether the
		 * pool is available for import in read-only mode. If the pool
		 * is available for import in read-write mode, it is displayed
		 * as available in userland; if it is not available for import
		 * in read-only mode, it is displayed as unavailable in
		 * userland. If the pool is available for import in read-only
		 * mode but not read-write mode, it is displayed as unavailable
		 * in userland with a special note that the pool is actually
		 * available for open in read-only mode.
		 *
		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
		 * missing a feature for write, we must first determine whether
		 * the pool can be opened read-only before returning to
		 * userland in order to know whether to display the
		 * abovementioned note.
		 */
		if (missing_feat_read || (*missing_feat_writep &&
		    spa_writeable(spa))) {
			spa_load_failed(spa, "pool uses unsupported features");
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}

		/*
		 * Load refcounts for ZFS features from disk into an in-memory
		 * cache during SPA initialization.
		 */
		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
			uint64_t refcount;

			error = feature_get_refcount_from_disk(spa,
			    &spa_feature_table[i], &refcount);
			if (error == 0) {
				spa->spa_feat_refcount_cache[i] = refcount;
			} else if (error == ENOTSUP) {
				/* ENOTSUP is cached as "disabled". */
				spa->spa_feat_refcount_cache[i] =
				    SPA_FEATURE_DISABLED;
			} else {
				spa_load_failed(spa, "error getting refcount "
				    "for feature %s [error=%d]",
				    spa_feature_table[i].fi_guid, error);
				return (spa_vdev_err(rvd,
				    VDEV_AUX_CORRUPT_DATA, EIO));
			}
		}
	}

	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Encryption was added before bookmark_v2, even though bookmark_v2
	 * is now a dependency. If this pool has encryption enabled without
	 * bookmark_v2, trigger an errata message.
	 */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
	}

	return (0);
}

/*
 * Call dsl_pool_open() with spa_is_initializing set for the duration of
 * the call.  Failures are logged and reported as
 * spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO).
 */
static int
spa_ld_load_special_directories(spa_t *spa)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	spa->spa_is_initializing = B_TRUE;
	error = dsl_pool_open(spa->spa_dsl_pool);
	spa->spa_is_initializing = B_FALSE;
	if (error != 0) {
		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	return (0);
}

/*
 * Read pool-wide properties and object numbers from the MOS pool directory
 * into the spa.  Entries looked up with B_FALSE tolerate ENOENT; any other
 * lookup failure is reported as
 * spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO).
 */
static int
spa_ld_get_props(spa_t *spa)
{
	int error = 0;
	uint64_t obj;
	vdev_t *rvd = spa->spa_root_vdev;

	/* Grab the checksum salt from the MOS. */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT, 1,
	    sizeof (spa->spa_cksum_salt.zcs_bytes),
	    spa->spa_cksum_salt.zcs_bytes);
	if (error == ENOENT) {
		/* Generate a new salt for subsequent use */
		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
		    sizeof (spa->spa_cksum_salt.zcs_bytes));
	} else if (error != 0) {
		spa_load_failed(spa, "unable to retrieve checksum salt from "
		    "MOS [error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0) {
		spa_load_failed(spa, "error opening deferred-frees bpobj "
		    "[error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
*/
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	    &spa->spa_creation_version, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
	    B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	    &spa->spa_errlog_scrub, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/* Load the last scrubbed txg. */
	error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG,
	    &spa->spa_scrubbed_last_txg, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the livelist deletion field. If a livelist is queued for
	 * deletion, indicate that in the spa
	 */
	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
	    &spa->spa_livelists_to_delete, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the per-vdev ZAP map. If we have an older pool, this will not
	 * be present; in this case, defer its creation to a later time to
	 * avoid dirtying the MOS this early / out of sync context. See
	 * spa_sync_config_object.
	 */

	/* The sentinel is only available in the MOS config. */
	nvlist_t *mos_config;
	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
		spa_load_failed(spa, "unable to retrieve MOS config");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Three outcomes: no ZAP map (initialize later), lookup failure
	 * (fatal), or a ZAP map without the sentinel (destroy later).
	 * mos_config is freed on every path out of this chain.
	 */
	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
	    &spa->spa_all_vdev_zaps, B_FALSE);

	if (error == ENOENT) {
		VERIFY(!nvlist_exists(mos_config,
		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	} else if (error != 0) {
		nvlist_free(mos_config);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
		/*
		 * An older version of ZFS overwrote the sentinel value, so
		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
		 * destruction to later; see spa_sync_config_object.
		 */
		spa->spa_avz_action = AVZ_ACTION_DESTROY;
		/*
		 * We're assuming that no vdevs have had their ZAPs created
		 * before this. Better be sure of it.
		 */
		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	}
	nvlist_free(mos_config);

	/* Start from the default; overridden below if the prop is set. */
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
	    B_FALSE);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace = 0;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
		    &spa->spa_dedup_table_quota);
		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If we are importing a pool with missing top-level vdevs,
	 * we enforce that the pool doesn't panic or get suspended on
	 * error since the likelihood of missing data is extremely high.
	 */
	if (spa->spa_missing_tvds > 0 &&
	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
		spa_load_note(spa, "forcing failmode to 'continue' "
		    "as some top level vdevs are missing");
		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
	}

	return (0);
}

/*
 * Look up the spares and l2cache objects in the MOS pool directory and,
 * unless 'type' is SPA_IMPORT_ASSEMBLE, load their nvlists and attach the
 * devices.  For SPA_IMPORT_ASSEMBLE only the sav_sync flag is set.
 */
static int
spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * If we're assembling the pool from the split-off vdevs of
	 * an existing pool, we don't want to attach the spares & cache
	 * devices.
	 */

	/*
	 * Load any hot spares for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
	    B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			spa_load_failed(spa, "error loading spares nvlist");
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Load any level 2 ARC devices for this pool.
*/ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 
*/ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_load_brt(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = brt_load(spa); if (error != 0) { spa_load_failed(spa, "brt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. 
*/ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. 
*/ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. 
*/
	error = spa_ld_parse_config(spa, type);
	if (error != 0)
		return (error);

	spa_import_progress_add(spa);

	/*
	 * Now that we have the vdev tree, try to open each vdev. This involves
	 * opening the underlying physical device, retrieving its geometry and
	 * probing the vdev with a dummy I/O. The state of each vdev will be set
	 * based on the success of those operations. After this we'll be ready
	 * to read from the vdevs.
	 */
	error = spa_ld_open_vdevs(spa);
	if (error != 0)
		return (error);

	/*
	 * Read the label of each vdev and make sure that the GUIDs stored
	 * there match the GUIDs in the config provided.
	 * If we're assembling a new pool that's been split off from an
	 * existing pool, the labels haven't yet been updated so we skip
	 * validation for now.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		error = spa_ld_validate_vdevs(spa);
		if (error != 0)
			return (error);
	}

	/*
	 * Read all vdev labels to find the best uberblock (i.e. latest,
	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
	 * get the list of features required to read blkptrs in the MOS from
	 * the vdev label with the best uberblock and verify that our version
	 * of zfs supports them all.
	 */
	error = spa_ld_select_uberblock(spa, type);
	if (error != 0)
		return (error);

	/*
	 * Pass that uberblock to the dsl_pool layer which will open the root
	 * blkptr. This blkptr points to the latest version of the MOS and will
	 * allow us to read its contents.
	 */
	error = spa_ld_open_rootbp(spa);
	if (error != 0)
		return (error);

	return (0);
}

/*
 * Make the checkpointed uberblock stored in the MOS the pool's current
 * uberblock and, if the pool is writeable, sync it to the vdev labels.
 *
 * Returns 0 on success, ZFS_ERR_NO_CHECKPOINT when the MOS has no
 * checkpoint entry, or the zap_lookup()/vdev_config_sync() error.
 */
static int
spa_ld_checkpoint_rewind(spa_t *spa)
{
	uberblock_t checkpoint;
	int error = 0;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

	if (error != 0) {
		spa_load_failed(spa, "unable to retrieve checkpointed "
		    "uberblock from the MOS config [error=%d]", error);

		/* Translate "not found" into the checkpoint-specific code. */
		if (error == ENOENT)
			error = ZFS_ERR_NO_CHECKPOINT;

		return (error);
	}

	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);

	/*
	 * We need to update the txg and timestamp of the checkpointed
	 * uberblock to be higher than the latest one. This ensures that
	 * the checkpointed uberblock is selected if we were to close and
	 * reopen the pool right after we've written it in the vdev labels.
	 * (also see block comment in vdev_uberblock_compare)
	 */
	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
	checkpoint.ub_timestamp = gethrestime_sec();

	/*
	 * Set current uberblock to be the checkpointed uberblock.
	 */
	spa->spa_uberblock = checkpoint;

	/*
	 * If we are doing a normal rewind, then the pool is open for
	 * writing and we sync the "updated" checkpointed uberblock to
	 * disk. Once this is done, we've basically rewound the whole
	 * pool and there is no way back.
	 *
	 * There are cases when we don't want to attempt and sync the
	 * checkpointed uberblock to disk because we are opening a
	 * pool as read-only. Specifically, verifying the checkpointed
	 * state with zdb, and importing the checkpointed state to get
	 * a "preview" of its content.
	 */
	if (spa_writeable(spa)) {
		vdev_t *rvd = spa->spa_root_vdev;

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
		int svdcount = 0;
		int children = rvd->vdev_children;
		/* Start the scan at a random child and wrap around. */
		int c0 = random_in_range(children);

		for (int c = 0; c < children; c++) {
			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];

			/* Stop when revisiting the first vdev */
			if (c > 0 && svd[0] == vd)
				break;

			/* Skip children that cannot take the sync. */
			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
			    !vdev_is_concrete(vd))
				continue;

			svd[svdcount++] = vd;
			if (svdcount == SPA_SYNC_MIN_VDEVS)
				break;
		}
		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0) {
			spa_load_failed(spa, "failed to write checkpointed "
			    "uberblock to the vdev labels [error=%d]", error);
			return (error);
		}
	}

	return (0);
}

/*
 * Run spa_ld_mos_init() followed by spa_ld_trusted_config().  If the
 * latter asks for a reload (EAGAIN), the spa is torn down and both steps
 * are repeated once with 'reloading' set.
 *
 * update_config_cache may be NULL; when non-NULL it is set to B_TRUE if a
 * reload was needed.
 */
static int
spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
    boolean_t *update_config_cache)
{
	int error;

	/*
	 * Parse the config for pool, open and validate vdevs,
	 * select an uberblock, and use that uberblock to open
	 * the MOS.
	 */
	error = spa_ld_mos_init(spa, type);
	if (error != 0)
		return (error);

	/*
	 * Retrieve the trusted config stored in the MOS and use it to create
	 * a new, exact version of the vdev tree, then reopen all vdevs.
	 */
	error = spa_ld_trusted_config(spa, type, B_FALSE);
	if (error == EAGAIN) {
		if (update_config_cache != NULL)
			*update_config_cache = B_TRUE;

		/*
		 * Redo the loading process with the trusted config if it is
		 * too different from the untrusted config.
		 */
		spa_ld_prepare_for_reload(spa);
		spa_load_note(spa, "RELOADING");
		error = spa_ld_mos_init(spa, type);
		if (error != 0)
			return (error);

		error = spa_ld_trusted_config(spa, type, B_TRUE);
		if (error != 0)
			return (error);

	} else if (error != 0) {
		return (error);
	}

	return (0);
}

/*
 * Load an existing storage pool, using the config provided.
This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Drop the namespace lock for the rest of the function. */ spa->spa_load_thread = curthread; mutex_exit(&spa_namespace_lock); /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) goto fail; /* * Retrieve the mapping of indirect vdevs. 
Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) goto fail; /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) goto fail; /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) goto fail; /* * Retrieve pool properties from the MOS. */ spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) goto fail; spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) goto fail; spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. 
*/ spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP); goto fail; } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) goto fail; /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * Before we do any zio_write's, complete the raidz expansion * scratch space copying, if necessary. */ if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) vdev_raidz_reflow_copy_scratch(spa); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. 
*/ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_import_progress_set_notes(spa, "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. 
*
		 * Note:
		 * Since we may be issuing deletes for clones here,
		 * we make sure to do so after we've spawned all the
		 * auxiliary threads above (from which the livelist
		 * deletion zthr is part of).
		 */
		spa_import_progress_set_notes(spa,
		    "Cleaning up inconsistent objsets");
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		spa_import_progress_set_notes(spa,
		    "Cleaning up temporary userrefs");
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_import_progress_set_notes(spa, "Restarting initialize");
		vdev_initialize_restart(spa->spa_root_vdev);
		spa_import_progress_set_notes(spa, "Restarting TRIM");
		vdev_trim_restart(spa->spa_root_vdev);
		vdev_autotrim_restart(spa);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_import_progress_set_notes(spa, "Finished importing");
	}
	zio_handle_import_delay(spa, gethrtime() - load_start);

	spa_import_progress_remove(spa_guid(spa));
	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);

	spa_load_note(spa, "LOADED");
fail:
	mutex_enter(&spa_namespace_lock);
	spa->spa_load_thread = NULL;
	cv_broadcast(&spa_namespace_cv);

	return (error);

}

/*
 * Tear the pool down and load it again from an older uberblock.
 *
 * The pool is unloaded and deactivated, spa_load_max_txg is lowered to one
 * less than the txg of the uberblock currently recorded in spa_uberblock,
 * and the pool is re-activated with the mode it had before the unload.
 * Async tasks are suspended before the new load is attempted.
 *
 * Returns the result of spa_load() for the given 'state'.
 */
static int
spa_load_retry(spa_t *spa, spa_load_state_t state)
{
	/* Remember the open mode; the pool is deactivated below. */
	spa_mode_t mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	/* Only consider uberblocks older than the one just tried. */
	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
	    (u_longlong_t)spa->spa_load_max_txg);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
}

/*
 * If spa_load() fails this function will try loading prior txg's. If
 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
 * function will not rewind the pool and will return the same error as
 * spa_load().
*/
/*
 * 'max_request' becomes spa_load_max_txg for the first load attempt, and any
 * value other than UINT64_MAX also turns on spa_extreme_rewind; the
 * exception is a SPA_LOAD_RECOVER load with spa_load_txg already set, which
 * uses spa_load_txg instead.
 *
 * 'rewind_flags' is tested for ZPOOL_NEVER_REWIND (return the first error
 * without retrying) and ZPOOL_EXTREME_REWIND (allow retries down to
 * TXG_INITIAL instead of stopping TXG_DEFER_SIZE txgs back).
 *
 * Returns 0 if the first load succeeds.  Otherwise returns the result of the
 * last rewind attempt when 'state' is SPA_LOAD_RECOVER, or the error of the
 * first load when it is not.  The import progress entry for this pool is
 * removed on every failure return.
 */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
    int rewind_flags)
{
	nvlist_t *loadinfo = NULL;
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
		if (max_request != UINT64_MAX)
			spa->spa_extreme_rewind = B_TRUE;
	}

	/* First attempt; both error slots start out with its result. */
	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
	if (load_error == 0)
		return (0);
	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
		/*
		 * When attempting checkpoint-rewind on a pool with no
		 * checkpoint, we should not attempt to load uberblocks
		 * from previous txgs when spa_load fails.
		 */
		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}

	/*
	 * Capture the config produced by the failed attempt, if a vdev tree
	 * exists.  It is either installed with spa_config_set() or freed
	 * before returning.
	 */
	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}

	if (state == SPA_LOAD_RECOVER) {
		/* Price of rolling back is discarding txgs, including log */
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		/*
		 * If we aren't rolling back save the load info from our first
		 * import attempt so that we can restore it after attempting
		 * to rewind.
		 */
		loadinfo = spa->spa_load_info;
		spa->spa_load_info = fnvlist_alloc();
	}

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		/* Going further back than the safe txg is an extreme rewind. */
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state);
	}

	/* Reset the rewind controls now that the retries are over. */
	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	/*
	 * Install the config saved from the first attempt unless a recovery
	 * rewind succeeded; otherwise release it.
	 */
	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);
	else
		nvlist_free(config);

	if (state == SPA_LOAD_RECOVER) {
		ASSERT3P(loadinfo, ==, NULL);
		spa_import_progress_remove(spa_guid(spa));
		return (rewind_error);
	} else {
		/* Store the rewind info as part of the initial load info */
		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
		    spa->spa_load_info);

		/* Restore the initial load info */
		fnvlist_free(spa->spa_load_info);
		spa->spa_load_info = loadinfo;

		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 *
 * On success a reference is taken on behalf of 'tag', *spapp is set to the
 * pool and 0 is returned; if 'config' is non-NULL, *config receives a freshly
 * generated config.  On failure *spapp is NULL.  If the pool is not found, or
 * the load fails with EBADF, ENOENT is returned.  For any other load failure
 * the load error is returned and, when 'config' is non-NULL and the pool has
 * a config, *config receives a copy of it with the load info attached.
 *
 * 'nvpolicy', when non-NULL, supplies the load policy instead of the pool's
 * own config.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, const void *tag,
    nvlist_t *nvpolicy, nvlist_t **config)
{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;
	int firstopen = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	/* The pool is known but has not been loaded yet: load it now. */
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		zpool_load_policy_t policy;

		firstopen = B_TRUE;

		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;

		zfs_dbgmsg("spa_open_common: opening %s", pool);
		error = spa_load_best(spa, state, policy.zlp_txg,
		    policy.zlp_rewind);

		if (error == EBADF) {
			/*
			 * A vdev_validate() failure (indicated by EBADF)
			 * means that one of the vdevs reports that the pool
			 * has been exported or destroyed.  If this is the
			 * case, the config cache is out of sync and we should
			 * remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(ENOENT));
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config) {
				*config = fnvlist_dup(spa->spa_config);
				fnvlist_add_nvlist(*config,
				    ZPOOL_CONFIG_LOAD_INFO,
				    spa->spa_load_info);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * If we've recovered the pool, pass back any information we
	 * gathered while doing the load.
	 */
	if (state == SPA_LOAD_RECOVER && config != NULL) {
		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info);
	}

	/*
	 * Only the call that took spa_namespace_lock resets the failure and
	 * rewind bookkeeping and drops the lock; a recursive call leaves both
	 * alone.
	 */
	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
	}

	if (firstopen)
		zvol_create_minors_recursive(spa_name(spa));

	*spapp = spa;

	return (0);
}

/*
 * Open the pool 'name' with an explicit load 'policy' and optionally return
 * its config.  All arguments are passed straight to spa_open_common().
 */
int
spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
    nvlist_t *policy, nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}

/*
 * Open the pool 'name' without a load policy override and without returning
 * its config (spa_open_common() is called with NULL for both).
 */
int
spa_open(const char *name, spa_t **spapp, const void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 *
 * Returns the pool, or NULL if no pool named 'name' is found.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

/*
 * Decrement the pool's inject count under spa_namespace_lock; the counterpart
 * of spa_inject_addref().
 */
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 *
 * The spares array from spa_spares.sav_config is added under the vdev tree
 * of 'config' and the vdev state in each entry's stats is then updated.
 * Does nothing when the pool has no spares.  The caller must hold SCL_CONFIG
 * as reader (asserted).
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
	if (nspares != 0) {
		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    (const nvlist_t * const *)spares, nspares);
		/*
		 * Look the array up again, this time under nvroot, so that
		 * the stats updated below are the ones stored in 'config'.
		 */
		VERIFY0(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares));

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			guid = fnvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID);
			VERIFY0(nvlist_lookup_uint64_array(spares[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			} else {
				vs->vs_state =
				    spa->spa_spares.sav_vdevs[i]->vdev_state;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 *
 * The l2cache array from spa_l2cache.sav_config is added under the vdev tree
 * of 'config'; each entry is then matched by guid against sav_vdevs and its
 * stats are refreshed from that vdev.  Does nothing when the pool has no
 * l2cache devices.  The caller must hold SCL_CONFIG as reader (asserted).
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
	if (nl2cache != 0) {
		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    (const nvlist_t * const *)l2cache, nl2cache);
		/*
		 * Look the array up again, this time under nvroot, so that
		 * the stats updated below are the ones stored in 'config'.
		 */
		VERIFY0(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			guid = fnvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID);

			/* Find the in-core vdev with the same guid. */
			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
			vdev_get_stats(vd, vs);
			vdev_config_generate_stats(vd, l2cache[i]);

		}
	}
}

/*
 * Fill 'features' by walking the pool's feature ZAP objects
 * (spa_feat_for_read_obj, then spa_feat_for_write_obj).  Each entry is added
 * as a uint64 keyed by the entry name, with the entry's first integer as the
 * value.  An object number of 0 is skipped.
 */
static void
spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
{
	zap_cursor_t zc;
	zap_attribute_t *za = zap_attribute_alloc();

	if (spa->spa_feat_for_read_obj != 0) {
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_feat_for_read_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
			    za->za_num_integers == 1);
			VERIFY0(nvlist_add_uint64(features, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
	}

	if (spa->spa_feat_for_write_obj != 0) {
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_feat_for_write_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
			    za->za_num_integers == 1);
			VERIFY0(nvlist_add_uint64(features, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
	}
	zap_attribute_free(za);
}

/*
 * Fill 'features' from spa_feature_table: for every entry whose refcount can
 * be obtained with feature_get_refcount(), add it as a uint64 keyed by the
 * feature's fi_guid.  Entries for which the lookup fails are skipped.
 */
static void
spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
{
	int i;

	for (i = 0; i < SPA_FEATURES; i++) {
		zfeature_info_t feature = spa_feature_table[i];
		uint64_t refcount;

		if (feature_get_refcount(spa, &feature, &refcount) != 0)
			continue;

		VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
	}
}

/*
 * Store a list of pool features and their reference counts in the
 * config.
 *
 * The first time this is called on a spa, allocate a new nvlist, fetch
 * the pool features and reference counts from disk, then save the list
 * in the spa.
In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; fnvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode); fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. 
*/ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. 
*/ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs)); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) newdevs[i] = fnvlist_dup(olddevs[i]); for (i = 0; i < ndevs; i++) newdevs[i + oldndevs] = fnvlist_dup(devs[i]); fnvlist_remove(sav->sav_config, config); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. 
*/ sav->sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)devs, ndevs); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; const char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; const char *feat_name; const char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. 
*/ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all 
async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. 
*/ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); /* * Create BRT table and BRT table object. */ brt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. 
Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); spa->spa_dedup_table_quota = zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; spa_import_os(spa); mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. 
*/ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; const char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. 
rewind info, missing devices, etc). */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. 
*/ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; spa->spa_spares.sav_label_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) fnvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE); else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; spa->spa_l2cache.sav_label_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. 
*/
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	spa_history_log_version(spa, "import", NULL);

	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);

	mutex_exit(&spa_namespace_lock);

	zvol_create_minors_recursive(pool);

	spa_import_os(spa);

	return (0);
}

/*
 * Probe the pool described by 'tryconfig' without leaving it in the
 * namespace.
 *
 * A temporary spa_t is created under a name built from TRYIMPORT_NAME, the
 * current thread pointer and the pool name, activated with SPA_MODE_READ and
 * loaded with SPA_LOAD_TRYIMPORT.  Whatever could be loaded is turned into a
 * config nvlist; the temporary spa_t is then unloaded, deactivated and
 * removed again.  spa_namespace_lock is held from spa_add() to spa_remove().
 *
 * Returns NULL if 'tryconfig' has no pool name or pool state, or if the load
 * left no root vdev; otherwise returns the generated config.  The spa_load()
 * error is not returned to the caller; it only gates the bootfs lookup.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	const char *poolname, *cachefile;
	spa_t *spa;
	uint64_t state;
	int error;
	zpool_load_policy_t policy;

	/* Both the pool name and the pool state must be present. */
	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
	    TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(name, tryconfig, NULL);
	spa_activate(spa, SPA_MODE_READ);
	kmem_free(name, MAXPATHLEN);

	/*
	 * Rewind pool if a max txg was provided.
	 */
	zpool_get_load_policy(spa->spa_config, &policy);
	if (policy.zlp_txg != UINT64_MAX) {
		spa->spa_load_max_txg = policy.zlp_txg;
		spa->spa_extreme_rewind = B_TRUE;
		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
		    poolname, (longlong_t)policy.zlp_txg);
	} else {
		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
	}

	/* Record whether the config came from a cachefile or from a scan. */
	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
	    == 0) {
		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
	} else {
		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
	}

	/*
	 * spa_import() relies on a pool config fetched by spa_tryimport()
	 * for spare/cache devices. Import flags are not passed to
	 * spa_tryimport(), which would make it return early on a missing log
	 * device and so never retrieve the cache and spare devices.
	 * Setting ZFS_IMPORT_MISSING_LOG here makes spa_tryimport() fetch
	 * the correct configuration regardless of the missing log device.
	 */
	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;

	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp);
		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
		    spa->spa_errata);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname;

				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/*
				 * Everything up to the first '/' is replaced
				 * by the real pool name; a name without a '/'
				 * is copied unchanged.
				 */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
				    dsname);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* Tear the temporary spa_t down again before dropping the lock. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 */
/*
 * Arguments, as used by the code below:
 *
 *	pool		name handed to spa_lookup()
 *	new_state	POOL_STATE_EXPORTED, POOL_STATE_DESTROYED or
 *			POOL_STATE_UNINITIALIZED; for the latter the spa_t is
 *			kept (spa_remove() is skipped) and only the exporting
 *			markers are reset
 *	oldconfig	if non-NULL, cleared on entry and, on success, set to
 *			a duplicate of spa->spa_config when one exists
 *	force		allows an export despite an active shared spare
 *	hardforce	skips the state/label dirtying, the spa_final_txg
 *			update and the cachefile write
 *
 * Returns 0 on success, or EROFS (global mode is not writable), ENOENT
 * (no such pool), ZFS_ERR_EXPORT_IN_PROGRESS (another export is running),
 * EBUSY (references remain) or EXDEV (active shared spare and !force).
 */
static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	int error = 0;
	spa_t *spa;
	hrtime_t export_start = gethrtime();

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & SPA_MODE_WRITE))
		return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	if (spa->spa_is_exporting) {
		/* the pool is being exported by another thread */
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
	}
	spa->spa_is_exporting = B_TRUE;

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks
	 * and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	if (spa->spa_zvol_taskq) {
		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
		taskq_wait(spa->spa_zvol_taskq);
	}
	mutex_enter(&spa_namespace_lock);
	spa->spa_export_thread = curthread;
	spa_close(spa, FTAG);

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		mutex_exit(&spa_namespace_lock);
		goto export_spa;
	}

	/*
	 * The pool will be in core if it's openable, in which case we can
	 * modify its state.  Objsets may be open only because they're dirty,
	 * so we have to force it to sync before checking spa_refcnt.
	 */
	if (spa->spa_sync_on) {
		txg_wait_synced(spa->spa_dsl_pool, 0);
		spa_evicting_os_wait(spa);
	}

	/*
	 * A pool cannot be exported or destroyed if there are active
	 * references.  If we are resetting a pool, allow references by
	 * fault injection handlers.
	 */
	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
		error = SET_ERROR(EBUSY);
		goto fail;
	}

	mutex_exit(&spa_namespace_lock);
	/*
	 * At this point we no longer hold the spa_namespace_lock and
	 * there were no references on the spa. Future spa_lookups will
	 * notice the spa->spa_export_thread and wait until we signal
	 * that we are finished.
	 */

	if (spa->spa_sync_on) {
		vdev_t *rvd = spa->spa_root_vdev;
		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			error = SET_ERROR(EXDEV);
			/* The fail path expects the namespace lock held. */
			mutex_enter(&spa_namespace_lock);
			goto fail;
		}

		/*
		 * We're about to export or destroy this pool. Make sure
		 * we stop all initialization and trim activity here before
		 * we set the spa_final_txg. This will ensure that all
		 * dirty data resulting from the initialization is
		 * committed to disk before we unload the pool.
		 */
		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
		vdev_autotrim_stop_all(spa);
		vdev_rebuild_stop_all(spa);
		l2arc_spa_rebuild_stop(spa);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			vdev_config_dirty(rvd);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}

		/*
		 * If the log space map feature is enabled and the pool is
		 * getting exported (but not destroyed), we want to spend some
		 * time flushing as many metaslabs as we can in an attempt to
		 * destroy log space maps and save import time. This has to be
		 * done before we set the spa_final_txg, otherwise
		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
		 * spa_should_flush_logs_on_unload() should be called after
		 * spa_state has been set to the new_state.
		 */
		if (spa_should_flush_logs_on_unload(spa))
			spa_unload_log_sm_flush_all(spa);

		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

export_spa:
	spa_export_os(spa);

	if (new_state == POOL_STATE_DESTROYED)
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
	else if (new_state == POOL_STATE_EXPORTED)
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		*oldconfig = fnvlist_dup(spa->spa_config);

	/* Only exports are handed the elapsed time since function entry. */
	if (new_state == POOL_STATE_EXPORTED)
		zio_handle_export_delay(spa, gethrtime() - export_start);

	/*
	 * Take the namespace lock for the actual spa_t removal
	 */
	mutex_enter(&spa_namespace_lock);
	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
		spa_remove(spa);
	} else {
		/*
		 * If spa_remove() is not called for this spa_t and
		 * there is any possibility that it can be reused,
		 * we make sure to reset the exporting flag.
		 */
		spa->spa_is_exporting = B_FALSE;
		spa->spa_export_thread = NULL;
	}

	/*
	 * Wake up any waiters in spa_lookup()
	 */
	cv_broadcast(&spa_namespace_cv);
	mutex_exit(&spa_namespace_lock);
	return (0);

fail:
	/* Reached with spa_namespace_lock held; undo the export markers. */
	spa->spa_is_exporting = B_FALSE;
	spa->spa_export_thread = NULL;

	spa_async_resume(spa);
	/*
	 * Wake up any waiters in spa_lookup()
	 */
	cv_broadcast(&spa_namespace_cv);
	mutex_exit(&spa_namespace_lock);
	return (error);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(const char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
*/
int
spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(const char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE,
	    B_FALSE));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * This is called as a synctask to increment the draid feature flag.
 * 'arg' is not a pointer: it carries the number of increments to apply,
 * passed through a uintptr_t cast.
 */
static void
spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	int draid = (int)(uintptr_t)arg;

	for (int c = 0; c < draid; c++)
		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
}

/*
 * Add a device to a storage pool.
 *
 * 'nvroot' describes the new top-level vdevs and may also carry
 * ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays.  If 'check_ashift' is
 * set and the pool's minimum and maximum ashift are equal, every new
 * top-level vdev must have that ashift or ZFS_ERR_ASHIFT_MISMATCH is used.
 *
 * Returns 0 on success.  Every failure path returns whatever
 * spa_vdev_exit() returns for the error passed to it.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
{
	uint64_t txg, ndraid = 0;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* A missing spares or l2cache array just means "none supplied". */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Nothing to add at all is an error. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	/*
	 * The virtual dRAID spares must be added after vdev tree is created
	 * and the vdev guids are generated.  The guid of their associated
	 * dRAID is stored in the config and used when opening the spare.
	 */
	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
	    rvd->vdev_children)) == 0) {
		/* Look the spares array up again once dRAIDs were counted. */
		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
			nspares = 0;
	} else {
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * If we are in the middle of a device removal, we can only add
	 * devices which match the existing devices in the pool.
	 * If we are in the middle of a removal, or have some indirect
	 * vdevs, we can not add raidz or dRAID top levels.
	 */
	if (spa->spa_vdev_removal != NULL ||
	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
		for (int c = 0; c < vd->vdev_children; c++) {
			tvd = vd->vdev_child[c];
			if (spa->spa_vdev_removal != NULL &&
			    tvd->vdev_ashift != spa->spa_max_ashift) {
				return (spa_vdev_exit(spa, vd, txg, EINVAL));
			}
			/* Fail if top level vdev is raidz or a dRAID */
			if (vdev_get_nparity(tvd) != 0)
				return (spa_vdev_exit(spa, vd, txg, EINVAL));

			/*
			 * Need the top level mirror to be
			 * a mirror of leaf vdevs only
			 */
			if (tvd->vdev_ops == &vdev_mirror_ops) {
				for (uint64_t cid = 0;
				    cid < tvd->vdev_children; cid++) {
					vdev_t *cvd = tvd->vdev_child[cid];
					if (!cvd->vdev_ops->vdev_op_leaf) {
						return (spa_vdev_exit(spa, vd,
						    txg, EINVAL));
					}
				}
			}
		}
	}

	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
		for (int c = 0; c < vd->vdev_children; c++) {
			tvd = vd->vdev_child[c];
			if (tvd->vdev_ashift != spa->spa_max_ashift) {
				return (spa_vdev_exit(spa, vd, txg,
				    ZFS_ERR_ASHIFT_MISMATCH));
			}
		}
	}

	/* Move each new top-level vdev from the parsed tree to the root. */
	for (int c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We can't increment a feature while holding spa_vdev so we
	 * have to do it in a synctask.
	 */
	if (ndraid != 0) {
		dmu_tx_t *tx;

		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
		    (void *)(uintptr_t)ndraid, tx);
		dmu_tx_commit(tx);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Given a vdev to be replaced and its parent, check for a possible
 * "double spare" condition if a vdev is to be replaced by a spare.  When this
 * happens, you can get two spares assigned to one failed vdev.
 *
 * To trigger a double spare condition:
 *
 * 1. disk1 fails
 * 2. 1st spare is kicked in for disk1 and it resilvers
 * 3. Someone replaces disk1 with a new blank disk
 * 4. New blank disk starts resilvering
 * 5.
While resilvering, new blank disk has IO errors and faults * 6. 2nd spare is kicked in for new blank disk * 7. At this point two spares are kicked in for the original disk1. * * It looks like this: * * NAME STATE READ WRITE CKSUM * tank2 DEGRADED 0 0 0 * draid2:6d:10c:2s-0 DEGRADED 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d1 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d2 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d3 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d4 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d5 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d6 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d7 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d8 ONLINE 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d9 ONLINE 0 0 0 * spare-9 DEGRADED 0 0 0 * replacing-0 DEGRADED 0 93 0 * scsi-0QEMU_QEMU_HARDDISK_d10-part1/old UNAVAIL 0 0 0 * spare-1 DEGRADED 0 0 0 * scsi-0QEMU_QEMU_HARDDISK_d10 REMOVED 0 0 0 * draid2-0-0 ONLINE 0 0 0 * draid2-0-1 ONLINE 0 0 0 * spares * draid2-0-0 INUSE currently in use * draid2-0-1 INUSE currently in use * * ARGS: * * newvd: New spare disk * pvd: Parent vdev_t the spare should attach to * * This function returns B_TRUE if adding the new vdev would create a double * spare condition, B_FALSE otherwise. */ static boolean_t spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd) { vdev_t *ppvd; ppvd = pvd->vdev_parent; if (ppvd == NULL) return (B_FALSE); /* * To determine if this configuration would cause a double spare, we * look at the vdev_op of the parent vdev, and of the parent's parent * vdev. We also look at vdev_isspare on the new disk. A double spare * condition looks like this: * * 1. parent of parent's op is a spare or draid spare * 2. parent's op is replacing * 3. new disk is a spare */ if ((ppvd->vdev_ops == &vdev_spare_ops) || (ppvd->vdev_ops == &vdev_draid_spare_ops)) if (pvd->vdev_ops == &vdev_replacing_ops) if (newvd->vdev_isspare) return (B_TRUE); return (B_FALSE); } /* * Attach a device to a vdev specified by its guid. 
The vdev type can be * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a * single device). When the vdev is a single device, a mirror vdev will be * automatically inserted. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa)) || dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_DEVRM_IN_PROGRESS)); } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; if (raidz) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * Can't expand a raidz while prior expand is in progress. */ if (spa->spa_raidz_expand != NULL) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); } } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); } if (raidz) pvd = oldvd; else pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * log, dedup and special vdevs should not be replaced by spares. */ if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } /* * A dRAID spare can only replace a child of its parent dRAID vdev. 
*/ if (newvd->vdev_ops == &vdev_draid_spare_ops && oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (rebuild) { /* * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable * vdevs types are the root vdev, a mirror, or dRAID. */ tvd = pvd; if (pvd->vdev_top != NULL) tvd = pvd->vdev_top; if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or * the root vdev. A raidz vdev can be attached to, but * you cannot attach to a raidz child. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && !raidz) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). 
*/ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) { vdev_dbgmsg(newvd, "disk would create double spares, ignore."); return (spa_vdev_exit(spa, newrootvd, txg, EEXIST)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { return (spa_vdev_exit(spa, newrootvd, txg, ZFS_ERR_ASHIFT_MISMATCH)); } /* * RAIDZ-expansion-specific checks. */ if (raidz) { if (vdev_raidz_attach_check(newvd) != 0) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * Fail early if a child is not healthy or being replaced */ for (int i = 0; i < oldvd->vdev_children; i++) { if (vdev_is_dead(oldvd->vdev_child[i]) || !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, newrootvd, txg, ENXIO)); } /* Also fail if reserved boot area is in-use */ if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) != 0) { return (spa_vdev_exit(spa, newrootvd, txg, EADDRINUSE)); } } } if (raidz) { /* * Note: oldvdpath is freed by spa_strfree(), but * kmem_asprintf() is freed by kmem_strfree(), so we have to * move it to a spa_strdup-ed string. 
*/ char *tmp = kmem_asprintf("raidz%u-%u", (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); oldvdpath = spa_strdup(tmp); kmem_strfree(tmp); } else { oldvdpath = spa_strdup(oldvd->vdev_path); } newvdpath = spa_strdup(newvd->vdev_path); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/old", newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } spa_strfree(oldvdpath); oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); } ASSERT(pvd->vdev_top->vdev_parent == rvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; if (raidz) { /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. 
*/ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(tvd); dtl_max_txg = spa_vdev_config_enter(spa); tvd->vdev_rz_expanding = B_TRUE; vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); vdev_config_dirty(tvd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, dtl_max_txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, newvd, tx); dmu_tx_commit(tx); } else { vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. * We do this to ensure that dmu_sync-ed blocks have been * stitched into the respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing or a spare vdev. 
*/
/*
 * 'guid' names the leaf vdev to detach.  'pguid', when non-zero, must match
 * the guid of that vdev's current parent or EBUSY is used.
 *
 * Errors passed to spa_vdev_exit() below are the checkpoint errors, ENODEV
 * (no such vdev), ENOTSUP (not a leaf, or parent type does not allow it)
 * and EBUSY (parent changed, or the vdev holds the only valid copy of some
 * data).  On the success path the value returned is that of the final
 * spa_vdev_exit() call.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_detach_enter(spa, guid);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	/*
	 * Besides being called directly from the userland through the
	 * ioctl interface, spa_vdev_detach() can be potentially called
	 * at the end of spa_vdev_resilver_done().
	 *
	 * In the regular case, when we have a checkpoint this shouldn't
	 * happen as we never empty the DTLs of a vdev during the scrub
	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
	 * should never get here when we have a checkpoint.
	 *
	 * That said, even in a case when we checkpoint the pool exactly
	 * as spa_vdev_resilver_done() calls this function everything
	 * should be fine as the resilver will return right away.
	 */
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			/* Sibling path must be exactly vd's path + "/old". */
			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a normal spare, then it
	 * implies that the spare should become a real disk, and be removed
	 * from the active spare list for the pool.  dRAID spares on the
	 * other hand are coupled to the pool and thus should never be removed
	 * from the spares list.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];

		if (last_cvd->vdev_isspare &&
		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
			unspare = B_TRUE;
		}
	}

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
	spa_notify_waiters(spa);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		mutex_enter(&spa_namespace_lock);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			/*
			 * Hold altspa while the namespace lock is dropped
			 * around the spa_vdev_remove() call.
			 */
			spa_open_ref(altspa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(altspa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*
 * Apply one initialize command (POOL_INITIALIZE_START, _CANCEL, _SUSPEND or
 * _UNINIT) to the vdev identified by 'guid'.  The caller must hold
 * spa_namespace_lock (asserted below).  'vd_list' is only handed on to
 * vdev_initialize_stop() for the cancel and suspend commands.
 *
 * Returns 0 on success, ENODEV if the vdev is missing or detached, EINVAL
 * if it is not a concrete leaf, EROFS if it is not writeable, and EBUSY or
 * ESRCH if the command does not fit the vdev's current initialize state.
 * Any other cmd_type panics.
 */
static int
spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
    list_t *vd_list)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);

	/* Look up vdev and ensure it's a leaf. */
	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_detached) {
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		return (SET_ERROR(ENODEV));
	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		return (SET_ERROR(EINVAL));
	} else if (!vdev_writeable(vd)) {
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		return (SET_ERROR(EROFS));
	}
	/* Take the per-vdev lock before letting go of the config locks. */
	mutex_enter(&vd->vdev_initialize_lock);
	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);

	/*
	 * When we activate an initialize action we check to see
	 * if the vdev_initialize_thread is NULL. We do this instead
	 * of using the vdev_initialize_state since there might be
	 * a previous initialization process which has completed but
	 * the thread is not exited.
	 */
	if (cmd_type == POOL_INITIALIZE_START &&
	    (vd->vdev_initialize_thread != NULL ||
	    vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
		mutex_exit(&vd->vdev_initialize_lock);
		return (SET_ERROR(EBUSY));
	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
		mutex_exit(&vd->vdev_initialize_lock);
		return (SET_ERROR(ESRCH));
	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
		mutex_exit(&vd->vdev_initialize_lock);
		return (SET_ERROR(ESRCH));
	} else if (cmd_type == POOL_INITIALIZE_UNINIT &&
	    vd->vdev_initialize_thread != NULL) {
		mutex_exit(&vd->vdev_initialize_lock);
		return (SET_ERROR(EBUSY));
	}

	switch (cmd_type) {
	case POOL_INITIALIZE_START:
		vdev_initialize(vd);
		break;
	case POOL_INITIALIZE_CANCEL:
		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
		break;
	case POOL_INITIALIZE_SUSPEND:
		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
		break;
	case POOL_INITIALIZE_UNINIT:
		vdev_uninitialize(vd);
		break;
	default:
		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
	}
	mutex_exit(&vd->vdev_initialize_lock);

	return (0);
}

int
spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
    nvlist_t *vdev_errlist)
{
	int total_errors = 0;
	list_t vd_list;

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	/*
	 * We hold the namespace lock through the whole function
	 * to prevent any changes to the pool while we're starting or
	 * stopping initialization. The config and state locks are held so that
	 * we can properly assess the vdev state before we commit to
	 * the initializing operation.
*/ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. 
We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. 
*/ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; const char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? 
ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 
vml[c]->vdev_top->vdev_asize); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ nvl = fnvlist_alloc(); fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. 
*/ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { newspa->spa_config_splitting = fnvlist_alloc(); fnvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops 
!= &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. 
*/
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	/* Search the subtree first; the first candidate found wins. */
	for (int i = 0; i < vd->vdev_children; i++) {
		vdev_t *found = spa_vdev_resilver_done_hunt(vd->vdev_child[i]);

		if (found != NULL)
			return (found);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		vdev_t *newest = vd->vdev_child[vd->vdev_children - 1];
		vdev_t *oldest = vd->vdev_child[0];

		if (vdev_dtl_empty(newest, DTL_MISSING) &&
		    vdev_dtl_empty(newest, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldest))
			return (oldest);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 * Also potentially update faulted state.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *head = vd->vdev_child[0];
		vdev_t *tail = vd->vdev_child[vd->vdev_children - 1];
		vdev_t *keep = NULL;
		vdev_t *drop = NULL;

		/* Whichever end is flagged 'unspare' stays; the other goes. */
		if (tail->vdev_unspare) {
			drop = head;
			keep = tail;
		} else if (head->vdev_unspare) {
			drop = tail;
			keep = head;
		}

		if (drop != NULL &&
		    vdev_dtl_empty(keep, DTL_MISSING) &&
		    vdev_dtl_empty(keep, DTL_OUTAGE) &&
		    !vdev_dtl_required(drop))
			return (drop);

		vdev_propagate_state(vd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			vdev_t *second = vd->vdev_child[1];

			if (second->vdev_isspare && tail->vdev_isspare &&
			    vdev_dtl_empty(tail, DTL_MISSING) &&
			    vdev_dtl_empty(tail, DTL_OUTAGE) &&
			    !vdev_dtl_required(second))
				return (second);
		}
	}

	return (NULL);
}

/*
 * Repeatedly ask spa_vdev_resilver_done_hunt() for a vdev that can be
 * detached and detach it, until none is left or a detach fails.  Waiters
 * are notified at the end when the loop runs out of candidates.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	for (;;) {
		vdev_t *vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev);

		if (vd == NULL)
			break;

		vdev_t *pvd = vd->vdev_parent;
		vdev_t *ppvd = pvd->vdev_parent;
		uint64_t vd_guid = vd->vdev_guid;
		uint64_t parent_guid = pvd->vdev_guid;
		uint64_t grandparent_guid = ppvd->vdev_guid;
		uint64_t spare_guid = 0;

		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			spare_guid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		/*
		 * The config lock is released around the detach calls; on
		 * failure we return with it already released.
		 */
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, vd_guid, parent_guid, B_TRUE) != 0)
			return;
		if (spare_guid != 0 && spa_vdev_detach(spa, spare_guid,
		    grandparent_guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * If a detach was not performed above replace waiters will not have
	 * been notified.  In which case we must do so now.
	 */
	spa_notify_waiters(spa);
}

/*
 * Update the stored path or FRU for this vdev.
*/ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { return (spa_scan_range(spa, func, 0, 0)); } int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return 
(SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } if (func == POOL_SCAN_ERRORSCRUB && !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) return (SET_ERROR(ENOTSUP)); return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. */ zfs_post_remove(spa, vd, by_kernel); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c], by_kernel); } static void spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) { if (vd->vdev_fault_wanted) { vdev_state_t newstate = VDEV_STATE_FAULTED; vd->vdev_fault_wanted = B_FALSE; /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. 
*/ if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { newstate = VDEV_STATE_DEGRADED; /* A required disk is missing so suspend the pool */ *suspend = B_TRUE; } vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) spa_async_fault_vdev(vd->vdev_child[c], suspend); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. 
*/ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) { boolean_t by_kernel = B_TRUE; if (tasks & SPA_ASYNC_REMOVE_BY_USER) by_kernel = B_FALSE; spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev, by_kernel); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i], by_kernel); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i], by_kernel); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be marked faulted. */ if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); boolean_t suspend = B_FALSE; spa_async_fault_vdev(spa->spa_root_vdev, &suspend); (void) spa_vdev_state_exit(spa, NULL, 0); if (suspend) zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || tasks & SPA_ASYNC_REBUILD_DONE || tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } /* * Kick off a resilver. 
*/ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. 
*/ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; if (raidz_expand_thread != NULL) zthr_cancel(raidz_expand_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; if (raidz_expand_thread != NULL) zthr_resume(raidz_expand_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & 
~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent 
syncing frees. */
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
	/* All frees are issued as children of one root zio, then awaited. */
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
	VERIFY(zio_wait(zio) == 0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing deferred frees.
 *
 * Does nothing unless this is sync pass 1.
 */
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
	if (spa_sync_pass(spa) != 1)
		return;

	/*
	 * Note:
	 * If the log space map feature is active, we stop deferring
	 * frees to the next TXG and therefore running this function
	 * would be considered a no-op as spa_deferred_bpobj should
	 * not have any entries.
	 *
	 * That said we run this function anyway (instead of returning
	 * immediately) for the edge-case scenario where we just
	 * activated the log space map feature in this TXG but we have
	 * deferred frees from the previous TXG.
	 */
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
	VERIFY0(zio_wait(zio));
}

/*
 * Write the nvlist 'nv' into MOS object 'obj'.
 *
 * The nvlist is XDR-packed into a buffer whose size is rounded up to a
 * multiple of SPA_CONFIG_BLOCKSIZE; the tail past the packed data is
 * zero-filled and the whole buffer is written at offset 0.  The packed
 * (unpadded) size is then stored as a uint64_t at the start of the
 * object's bonus buffer.
 */
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = vmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	/* Zero the padding between the packed data and the buffer end. */
	memset(packed + nvsize, 0, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	vmem_free(packed, bufsize);

	/* Record the packed size in the object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the device list of an auxiliary vdev set (sav) to the MOS.
 *
 * Does nothing unless sav->sav_sync is set.  'config' is the nvlist key
 * under which the array of per-device configs is stored, and 'entry' is the
 * pool directory name used to link the backing object when it has to be
 * allocated.  sav_sync is cleared once the nvlist has been written.
 */
static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		/* First sync: allocate the object and link it by name. */
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	nvroot = fnvlist_alloc();
	if (sav->sav_count == 0) {
		/* No devices: store an empty array under 'config'. */
		fnvlist_add_nvlist_array(nvroot, config,
		    (const nvlist_t * const *)NULL, 0);
	} else {
		list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		fnvlist_add_nvlist_array(nvroot, config,
		    (const nvlist_t * const *)list, sav->sav_count);
		/* The generated configs and the array are freed right after. */
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

/*
 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
 * The all-vdev ZAP must be empty.
*/
static void
spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;

	/* The root ZAP is only recorded when the AVZ_V2 feature is active. */
	if (vd->vdev_root_zap != 0 &&
	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_root_zap, tx));
	}

	if (vd->vdev_top_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_top_zap, tx));
	}

	if (vd->vdev_leaf_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_leaf_zap, tx));
	}

	/* Recurse over the whole vdev subtree. */
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_avz_build(vd->vdev_child[i], avz, tx);
	}
}

/*
 * Sync the pool configuration object to the MOS.
 *
 * Returns immediately when no vdev config is dirty and no AVZ (all-vdev
 * ZAP) action is pending.  Otherwise, holding SCL_STATE as reader, it
 * carries out the pending AVZ action (rebuild or destroy), makes sure an
 * AVZ exists, creates missing per-vdev ZAPs, generates the config nvlist
 * and writes it to spa_config_object.  The generated nvlist replaces
 * spa_config_syncing.
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	/*
	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
	 * its config may not be dirty but we still need to build per-vdev ZAPs.
	 * Similarly, if the pool is being assembled (e.g. after a split), we
	 * need to rebuild the AVZ although the config may not be dirty.
	 */
	if (list_is_empty(&spa->spa_config_dirty_list) &&
	    spa->spa_avz_action == AVZ_ACTION_NONE)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
	    spa->spa_all_vdev_zaps != 0);

	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
		/* Make and build the new AVZ */
		uint64_t new_avz = zap_create(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
		spa_avz_build(spa->spa_root_vdev, new_avz, tx);

		/* Diff old AVZ with new one */
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t vdzap = za->za_first_integer;
			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
			    vdzap) == ENOENT) {
				/*
				 * ZAP is listed in old AVZ but not in new one;
				 * destroy it
				 */
				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
				    tx));
			}
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy the old AVZ */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));

		/* Replace the old AVZ in the dir obj with the new one */
		VERIFY0(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
		    sizeof (new_avz), 1, &new_avz, tx));

		spa->spa_all_vdev_zaps = new_avz;
	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		/* Walk through the AVZ and destroy all listed ZAPs */
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t zap = za->za_first_integer;
			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy and unlink the AVZ itself */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));
		VERIFY0(zap_remove(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
		spa->spa_all_vdev_zaps = 0;
	}

	/* Make sure an AVZ exists (also re-creates it after a destroy). */
	if (spa->spa_all_vdev_zaps == 0) {
		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_VDEV_ZAP_MAP, tx);
	}
	spa->spa_avz_action = AVZ_ACTION_NONE;

	/* Create ZAPs for vdevs that don't have them. */
	vdev_construct_zaps(spa->spa_root_vdev, tx);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	/* The new config replaces any previously pending one. */
	nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set the pool version to the uint64_t that 'arg' points to: store it in
 * the in-core uberblock, dirty the root vdev's config and log the change
 * to the pool history.  The (void *arg, dmu_tx_t *tx) signature suggests
 * this is run as a sync task -- NOTE(review): confirm against the caller.
 */
static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld",
	    (longlong_t)version);
}

/*
 * Set zpool properties.
 *
 * 'arg' is an nvlist of property name/value pairs.  Each pair is handled
 * under spa_props_lock: some properties are special-cased (version,
 * altroot, readonly, cachefile, comment, compatibility, feature@...);
 * everything else is stored in the pool properties ZAP object, and selected
 * numeric properties are also mirrored into fields of the spa_t.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		const char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		const char *elemname = nvpair_name(elem);
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(elemname)) {
		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			/* Replace the in-core copy of the comment. */
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  We also need to
			 * update the cache file to keep it in sync with the
			 * MOS version. It's unnecessary to do this for pool
			 * creation since the vdev's configuration has already
			 * been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", elemname, strval);
			break;
		case ZPOOL_PROP_COMPATIBILITY:
			strval = fnvpair_value_string(elem);
			/* Replace the in-core copy of the compatibility value. */
			if (spa->spa_compatibility != NULL)
				spa_strfree(spa->spa_compatibility);
			spa->spa_compatibility = spa_strdup(strval);
			/*
			 * Dirty the configuration on vdevs as above.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}

			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;

		case ZPOOL_PROP_INVAL:
			/*
			 * Not a known pool property name: either a feature
			 * ("...@<feature>") to enable, or a user property,
			 * which falls through to the generic ZAP path below.
			 */
			if (zpool_prop_feature(elemname)) {
				fname = strchr(elemname, '@') + 1;
				VERIFY0(zfeature_lookup_name(fname, &fid));

				spa_feature_enable(spa, fid, tx);
				spa_history_log_internal(spa, "set", tx,
				    "%s=enabled", elemname);
				break;
			} else if (!zfs_prop_user(elemname)) {
				/* Neither feature nor user property. */
				ASSERT(zpool_prop_feature(elemname));
				break;
			}
			zfs_fallthrough;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			if (prop == ZPOOL_PROP_INVAL) {
				propname = elemname;
				proptype = PROP_TYPE_STRING;
			} else {
				propname = zpool_prop_to_name(prop);
				proptype = zpool_prop_get_type(prop);
			}

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				if (strlen(strval) == 0) {
					/* remove the property if value == "" */
					(void) zap_remove(mos,
					    spa->spa_pool_props_object,
					    propname, tx);
				} else {
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object,
					    propname, 1, strlen(strval) + 1,
					    strval, tx));
				}
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", elemname, strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				if (proptype == PROP_TYPE_INDEX) {
					/* Verify intval maps to an index name. */
					const char *unused;
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", elemname,
				    (longlong_t)intval);

				/* Mirror selected properties into the spa_t. */
				switch (prop) {
				case ZPOOL_PROP_DELEGATION:
					spa->spa_delegation = intval;
					break;
				case ZPOOL_PROP_BOOTFS:
					spa->spa_bootfs = intval;
					break;
				case ZPOOL_PROP_FAILUREMODE:
					spa->spa_failmode = intval;
					break;
				case ZPOOL_PROP_AUTOTRIM:
					spa->spa_autotrim = intval;
					spa_async_request(spa,
					    SPA_ASYNC_AUTOTRIM_RESTART);
					break;
				case ZPOOL_PROP_AUTOEXPAND:
					spa->spa_autoexpand = intval;
					if (tx->tx_txg != TXG_INITIAL)
						spa_async_request(spa,
						    SPA_ASYNC_AUTOEXPAND);
					break;
				case ZPOOL_PROP_MULTIHOST:
					spa->spa_multihost = intval;
					break;
				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
					spa->spa_dedup_table_quota = intval;
					break;
				default:
					break;
				}
			} else {
				ASSERT(0); /* not allowed */
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes.
spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 *
 * Only runs in sync pass 1.  Each step fires when the last-synced version
 * (spa_ubsync) is below a threshold that the in-core uberblock version has
 * reached.  The whole sequence runs with dp_config_rwlock held as writer.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	if (spa_sync_pass(spa) != 1)
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
	 * when possibility to use lz4 compression for metadata was added
	 * Old pools that have this feature enabled must be upgraded to have
	 * this feature active
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now.  Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Assertion-only consistency check of a top-level vdev's indirect /
 * obsolete state at the start of sync.  Every check is an ASSERT-family
 * macro, and the mapping/births locals are marked __maybe_unused for
 * builds where those macros expand to nothing.
 */
static void
vdev_indirect_state_sync_verify(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;

	/* An indirect vdev must have both a mapping and a births object. */
	if (vd->vdev_ops == &vdev_indirect_ops) {
		ASSERT(vim != NULL);
		ASSERT(vib != NULL);
	}

	uint64_t obsolete_sm_object = 0;
	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
		ASSERT3U(obsolete_sm_object, ==,
		    space_map_object(vd->vdev_obsolete_sm));
		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
		    space_map_allocated(vd->vdev_obsolete_sm));
	}
	ASSERT(vd->vdev_obsolete_segments != NULL);

	/*
	 * Since frees / remaps to an indirect vdev can only
	 * happen in syncing context, the obsolete segments
	 * tree must be empty when we start syncing.
	 */
	ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments));
}

/*
 * Set the top-level vdev's max queue depth. Evaluate each top-level's
 * async write queue depth in case it changed. The max queue depth will
 * not change in the middle of syncing out this txg.
*/
static void
spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
	ASSERT(spa_writeable(spa));

	/* Rebalance the normal, special and dedup classes, in that order. */
	metaslab_class_balance(spa_normal_class(spa), B_TRUE);
	metaslab_class_balance(spa_special_class(spa), B_TRUE);
	metaslab_class_balance(spa_dedup_class(spa), B_TRUE);
}

/*
 * Verify the indirect state of each top-level vdev and start condensing
 * the first one for which vdev_indirect_should_condense() is true.  The
 * walk stops after starting one condense, so vdevs after it are not
 * verified in this call.
 */
static void
spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
{
	ASSERT(spa_writeable(spa));

	vdev_t *rvd = spa->spa_root_vdev;
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];
		vdev_indirect_state_sync_verify(vd);

		if (vdev_indirect_should_condense(vd)) {
			spa_condense_indirect_start_sync(vd, tx);
			break;
		}
	}
}

/*
 * Run sync passes for tx's txg until the MOS is no longer dirty.
 *
 * Each pass increments spa_sync_pass and then syncs the config object,
 * aux devices, error log, DSL pool, frees, BRT/DDT, scans, device removal,
 * upgrades, metaslab flushes and dirty vdevs.  If pass 1 turns out to have
 * changed nothing, the loop exits early so the txg stays a no-op.
 */
static void
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
{
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	uint64_t txg = tx->tx_txg;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];

	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free ||
		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
			/*
			 * If the log space map feature is active we don't
			 * care about deferred frees and the deferred bpobj
			 * as the log space map should effectively have the
			 * same results (i.e. appending only to one object).
			 */
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We can not defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		brt_sync(spa, txg);
		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);
		dsl_errorscrub_sync(dp, tx);
		svr_sync(spa, tx);
		spa_sync_upgrades(spa, tx);

		spa_flush_metaslabs(spa, tx);

		/* Sync every vdev queued on this txg's vdev list. */
		vdev_t *vd = NULL;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			/*
			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
			 * the config. If that happens, this txg should not
			 * be a no-op. So we must sync the config to the MOS
			 * before checking for no-op.
			 *
			 * Note that when the config is dirty, it will
			 * be written to the MOS (i.e. the MOS will be
			 * dirtied) every time we call spa_sync_config_object()
			 * in this txg.  Therefore we can't call this after
			 * dsl_pool_sync() every pass, because it would
			 * prevent us from converging, since we'd dirty
			 * the MOS every pass.
			 *
			 * Sync tasks can only be processed in pass 1, so
			 * there's no need to do this in later passes.
			 */
			spa_sync_config_object(spa, tx);
		}

		/*
		 * Note: We need to check if the MOS is dirty because we could
		 * have marked the MOS dirty without updating the uberblock
		 * (e.g. if we have sync tasks but no dirty user data). We need
		 * to check the uberblock's rootbp because it is updated if we
		 * have synced out dirty data (though in this case the MOS will
		 * most likely also be dirty due to second order effects, we
		 * don't want to rely on that here).
		 */
		if (pass == 1 &&
		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
		    !dmu_objset_is_dirty(mos, txg)) {
			/*
			 * Nothing changed on the first pass, therefore this
			 * TXG is a no-op. Avoid syncing deferred frees, so
			 * that we can keep this TXG as a no-op.
			 */
			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
			break;
		}

		spa_sync_deferred_frees(spa, tx);
	} while (dmu_objset_is_dirty(mos, txg));
}

/*
 * Rewrite the vdev configuration (which includes the uberblock) to
 * commit the transaction group.
 *
 * If there are no dirty vdevs, we sync the uberblock to a few random
 * top-level vdevs that are known to be visible in the config cache
 * (see spa_vdev_add() for a complete description).  If there *are* dirty
 * vdevs, sync the uberblock to all vdevs.
 *
 * The write is retried in a loop: on failure the pool is suspended with
 * ZIO_SUSPEND_IOERR and this thread waits in zio_resume_wait() before
 * trying again.
 */
static void
spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t txg = tx->tx_txg;

	for (;;) {
		int error = 0;

		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
			int svdcount = 0;
			int children = rvd->vdev_children;
			/* Start the scan at a random top-level vdev. */
			int c0 = random_in_range(children);

			for (int c = 0; c < children; c++) {
				vdev_t *vd =
				    rvd->vdev_child[(c0 + c) % children];

				/* Stop when revisiting the first vdev */
				if (c > 0 && svd[0] == vd)
					break;

				/* Skip log, non-concrete, or no-ms-array vdevs. */
				if (vd->vdev_ms_array == 0 ||
				    vd->vdev_islog ||
				    !vdev_is_concrete(vd))
					continue;

				svd[svdcount++] = vd;
				if (svdcount == SPA_SYNC_MIN_VDEVS)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
		zio_resume_wait(spa);
	}
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
*/
void
spa_sync(spa_t *spa, uint64_t txg)
{
	vdev_t *vd = NULL;

	VERIFY(spa_writeable(spa));

	/*
	 * Wait for i/os issued in open context that need to complete
	 * before this txg syncs.
	 */
	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	/*
	 * Now that there can be no more cloning in this transaction group,
	 * but we are still before issuing frees, we can process pending BRT
	 * updates.
	 */
	brt_pending_apply(spa, txg);

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
		/* Avoid holding the write lock unless actually necessary */
		if (vd->vdev_aux == NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
			continue;
		}
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		/* Drop back to reader before continuing the outer loop. */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	dsl_pool_t *dp = spa->spa_dsl_pool;
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);

	/* Re-arm the deadman: cancel the old delayed task, schedule anew. */
	spa->spa_sync_starttime = gethrtime();
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
	    NSEC_TO_TICK(spa->spa_deadman_synctime));

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		vdev_t *rvd = spa->spa_root_vdev;

		int i;
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		/* Loop ran to completion: every child matched. */
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY0(zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	spa_sync_adjust_vdev_max_queue_depth(spa);

	spa_sync_condense_indirect(spa, tx);

	spa_sync_iterate_to_convergence(spa, tx);

#ifdef ZFS_DEBUG
	if (!list_is_empty(&spa->spa_config_dirty_list)) {
	/*
	 * Make sure that the number of ZAPs for all the vdevs matches
	 * the number of ZAPs in the per-vdev ZAP list. This only gets
	 * called if the config is dirty; otherwise there may be
	 * outstanding AVZ operations that weren't completed in
	 * spa_sync_config_object.
	 */
		uint64_t all_vdev_zap_entry_count;
		ASSERT0(zap_count(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
		    all_vdev_zap_entry_count);
	}
#endif

	if (spa->spa_vdev_removal != NULL) {
		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
	}

	spa_sync_rewrite_vdev_config(spa, tx);
	dmu_tx_commit(tx);

	/* Sync is done writing; disarm the deadman. */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
	spa->spa_deadman_tqid = 0;

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	metaslab_class_evict_old(spa->spa_normal_class, txg);
	metaslab_class_evict_old(spa->spa_log_class, txg);
	/* spa_embedded_log_class has only one metaslab per vdev. */
	metaslab_class_evict_old(spa->spa_special_class, txg);
	metaslab_class_evict_old(spa->spa_dedup_class, txg);

	spa_sync_close_syncing_log_sm(spa);

	spa_update_dspace(spa);

	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
		vdev_autotrim_kick(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	/* Spin here, one tick at a time, while zfs_pause_spa_sync is set. */
	while (zfs_pause_spa_sync)
		delay(1);

	spa->spa_sync_pass = 0;

	/*
	 * Update the last synced uberblock here. We want to do this at
	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
	 * will be guaranteed that all the processing associated with
	 * that txg has been completed.
	 */
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 *
 * Pools that are not active, not writeable, or suspended are skipped.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		/* The reference is held while the namespace lock is dropped. */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Create the pool's sync taskq with one thread per allocator
 * (spa_alloc_count) and allocate the spa_syncthreads table.  Entry i of
 * the table records thread i of the taskq and is initially assigned
 * allocator i.  Returns the new taskq, which is also stored in
 * spa->spa_sync_tq.
 */
taskq_t *
spa_sync_tq_create(spa_t *spa, const char *name)
{
	kthread_t **kthreads;

	ASSERT(spa->spa_sync_tq == NULL);
	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);

	/*
	 * - do not allow more allocators than cpus.
	 * - there may be more cpus than allocators.
	 * - do not allow more sync taskq threads than allocators or cpus.
	 */
	int nthreads = spa->spa_alloc_count;
	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
	    nthreads, KM_SLEEP);

	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
	VERIFY(spa->spa_sync_tq != NULL);
	VERIFY(kthreads != NULL);

	spa_syncthread_info_t *ti = spa->spa_syncthreads;
	for (int i = 0; i < nthreads; i++, ti++) {
		ti->sti_thread = kthreads[i];
		ti->sti_allocator = i;
	}

	/* The thread array returned by taskq_create_synced() is ours to free. */
	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
	return (spa->spa_sync_tq);
}

/*
 * Wait for the sync taskq to drain, destroy it, and free the
 * spa_syncthreads table.  spa_sync_tq is reset to NULL; the
 * spa_syncthreads pointer itself is left as is.
 */
void
spa_sync_tq_destroy(spa_t *spa)
{
	ASSERT(spa->spa_sync_tq != NULL);

	taskq_wait(spa->spa_sync_tq);
	taskq_destroy(spa->spa_sync_tq);
	kmem_free(spa->spa_syncthreads,
	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
	spa->spa_sync_tq = NULL;
}

/*
 * Acquire an allocator for the calling thread and return its index.
 *
 * With a single allocator this is always 0 and nothing is recorded.
 * Otherwise, under sau_lock, the search starts after sau_rotor and takes
 * the first allocator not marked in use, wrapping around; the loop keeps
 * going until one is found.  The choice is then stored in the calling
 * thread's spa_syncthreads entry (the caller is asserted to have one).
 */
uint_t
spa_acq_allocator(spa_t *spa)
{
	int i;

	if (spa->spa_alloc_count == 1)
		return (0);

	mutex_enter(&spa->spa_allocs_use->sau_lock);
	uint_t r = spa->spa_allocs_use->sau_rotor;
	do {
		if (++r == spa->spa_alloc_count)
			r = 0;
	} while (spa->spa_allocs_use->sau_inuse[r]);
	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
	spa->spa_allocs_use->sau_rotor = r;
	mutex_exit(&spa->spa_allocs_use->sau_lock);

	spa_syncthread_info_t *ti = spa->spa_syncthreads;
	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
		if (ti->sti_thread == curthread) {
			ti->sti_allocator = r;
			break;
		}
	}
	ASSERT3S(i, <, spa->spa_alloc_count);
	return (r);
}

/*
 * Release an allocator obtained from spa_acq_allocator().  No-op with a
 * single allocator.  The in-use flag is cleared without taking sau_lock.
 */
void
spa_rel_allocator(spa_t *spa, uint_t allocator)
{
	if (spa->spa_alloc_count > 1)
		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
}

/*
 * Choose the allocator for a write zio and store it in zio->io_allocator,
 * unless the zio already has one.  A sync taskq thread uses the allocator
 * recorded for it in spa_syncthreads; any other thread gets one derived
 * from a hash of the zio's bookmark.
 */
void
spa_select_allocator(zio_t *zio)
{
	zbookmark_phys_t *bm = &zio->io_bookmark;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);

	/*
	 * A gang block (for example) may have inherited its parent's
	 * allocator, in which case there is nothing further to do here.
	 */
	if (ZIO_HAS_ALLOCATOR(zio))
		return;

	ASSERT(spa != NULL);
	ASSERT(bm != NULL);

	/*
	 * First try to use an allocator assigned to the syncthread, and set
	 * the corresponding write issue taskq for the allocator.
	 * Note, we must have an open pool to do this.
	 */
	if (spa->spa_sync_tq != NULL) {
		spa_syncthread_info_t *ti = spa->spa_syncthreads;
		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
			if (ti->sti_thread == curthread) {
				zio->io_allocator = ti->sti_allocator;
				return;
			}
		}
	}

	/*
	 * We want to try to use as many allocators as possible to help improve
	 * performance, but we also want logically adjacent IOs to be physically
	 * adjacent to improve sequential read performance. We chunk each object
	 * into 2^20 block regions, and then hash based on the objset, object,
	 * level, and region to accomplish both of these goals.
	 */
	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
	    bm->zb_blkid >> 20);

	/* Note: the hash is narrowed to uint_t before the modulo. */
	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Look up a vdev by guid.  The root vdev tree is searched first; when
 * 'aux' is true the l2cache devices and then the spares are searched as
 * well.  Returns NULL if nothing matches.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

/*
 * Upgrade the pool to 'version': under the full config write lock, store
 * the version in the in-core uberblock and dirty the root vdev's config,
 * then wait for a txg to sync.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Return B_TRUE if 'guid' names a device in the aux set 'sav', either
 * among its current vdevs or among its pending nvlist configs (matched on
 * ZPOOL_CONFIG_GUID).  The spa argument is unused.
 */
static boolean_t
spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
{
	(void) spa;
	int i;
	uint64_t vdev_guid;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &vdev_guid) == 0 && vdev_guid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/* Return B_TRUE if 'guid' is a current or pending l2cache device. */
boolean_t
spa_has_l2cache(spa_t *spa, uint64_t guid)
{
	return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
}

/* Return B_TRUE if 'guid' is a current or pending spare device. */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
}

/*
 * Check if a pool has an active shared spare device.
* Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. 
*/
/*
 * Recursive helper: returns B_TRUE if 'vd' itself, or any vdev below it, is
 * in the active state for 'activity' (which must be ZPOOL_WAIT_INITIALIZE or
 * ZPOOL_WAIT_TRIM).
 *
 * Entered with the config lock (SCL_CONFIG | SCL_STATE, reader) and
 * spa_activities_lock held. spa_activities_lock is dropped briefly so that
 * the per-vdev initialize/trim lock can be acquired ahead of it, and is held
 * again on return.
 */
static boolean_t
spa_vdev_activity_in_progress_impl(vdev_t *vd,
    zpool_wait_activity_t activity)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
	    activity == ZPOOL_WAIT_TRIM);

	/* Pick the per-vdev lock that protects the state being sampled. */
	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;

	/* Re-order: take the vdev lock first, then spa_activities_lock. */
	mutex_exit(&spa->spa_activities_lock);
	mutex_enter(lock);
	mutex_enter(&spa->spa_activities_lock);

	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
	mutex_exit(lock);

	if (in_progress)
		return (B_TRUE);

	/* Not active here; stop at the first active descendant, if any. */
	for (int i = 0; i < vd->vdev_children; i++) {
		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
		    activity))
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * If use_guid is true, this checks whether the vdev specified by guid is
 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
 * is being initialized/trimmed.
 *
 * The caller must hold spa_activities_lock. It is dropped and re-acquired
 * here so that the config lock (SCL_CONFIG | SCL_STATE, as reader) can be
 * taken ahead of it; the config lock is released again before returning and
 * spa_activities_lock is held on return.
 *
 * Returns 0 with the answer stored in *in_progress, or EINVAL (leaving
 * *in_progress untouched) when use_guid is set and the guid does not name a
 * leaf vdev of the pool.
 */
static int
spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
    zpool_wait_activity_t activity, boolean_t *in_progress)
{
	mutex_exit(&spa->spa_activities_lock);
	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	mutex_enter(&spa->spa_activities_lock);

	vdev_t *vd;
	if (use_guid) {
		/* Aux devices (spares, l2cache) are not searched. */
		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
			return (EINVAL);
		}
	} else {
		vd = spa->spa_root_vdev;
	}

	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);

	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
	return (0);
}

/*
 * Locking for waiting threads
 * ---------------------------
 *
 * Waiting threads need a way to check whether a given activity is in progress,
 * and then, if it is, wait for it to complete.
Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. 
The order in which spa_activities_lock and the
 * activity specific lock are acquired in the waiting thread is determined by
 * the order in which they are acquired in the completing thread; if the
 * completing thread calls spa_notify_waiters with the activity-specific lock
 * held, then the waiting thread must also acquire the activity-specific lock
 * first.
 */

/*
 * Store in *in_progress whether 'activity' is currently in progress on 'spa'.
 *
 * 'use_tag' and 'tag' are only consulted for ZPOOL_WAIT_INITIALIZE and
 * ZPOOL_WAIT_TRIM, where they are passed on as the use_guid/guid pair.
 * spa_activities_lock must be held on entry (asserted) and is held on return,
 * although some cases drop and re-acquire it in between.
 *
 * Returns 0, or the error from spa_vdev_activity_in_progress() for the
 * initialize/trim cases. Panics on an unrecognized activity value.
 */
static int
spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));

	switch (activity) {
	case ZPOOL_WAIT_CKPT_DISCARD:
		/*
		 * The checkpoint feature is active but the checkpoint entry
		 * is no longer present in the pool directory object.
		 */
		*in_progress =
		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
		    zap_contains(spa_meta_objset(spa),
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
		    ENOENT);
		break;
	case ZPOOL_WAIT_FREE:
		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
		    spa_livelist_delete_check(spa));
		break;
	case ZPOOL_WAIT_INITIALIZE:
	case ZPOOL_WAIT_TRIM:
		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
		    activity, in_progress);
		break;
	case ZPOOL_WAIT_REPLACE:
		/* Take the config lock ahead of spa_activities_lock. */
		mutex_exit(&spa->spa_activities_lock);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
		mutex_enter(&spa->spa_activities_lock);

		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		break;
	case ZPOOL_WAIT_REMOVE:
		*in_progress = (spa->spa_removing_phys.sr_state ==
		    DSS_SCANNING);
		break;
	case ZPOOL_WAIT_RESILVER:
		/*
		 * An active rebuild counts as a resilver in progress;
		 * otherwise fall through to the scan-state check below.
		 */
		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
		if (*in_progress)
			break;
		zfs_fallthrough;
	case ZPOOL_WAIT_SCRUB:
	{
		boolean_t scanning, paused, is_scrub;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
		paused = dsl_scan_is_paused_scrub(scn);
		/*
		 * In progress only when a scan is running, is not paused, and
		 * its kind matches the request: a scrub for
		 * ZPOOL_WAIT_SCRUB, a non-scrub scan for ZPOOL_WAIT_RESILVER.
		 */
		*in_progress = (scanning && !paused &&
		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
		break;
	}
	case ZPOOL_WAIT_RAIDZ_EXPAND:
	{
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
		break;
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}

/*
 * Common implementation of spa_wait() and spa_wait_tag(): open the pool named
 * 'pool' and sleep until 'activity' is no longer in progress, the wait is
 * cancelled through spa_waiters_cancel, or the sleep is interrupted.
 *
 * *waited is set to B_FALSE once the pool has been opened and to B_TRUE just
 * before each sleep; it is not written when the function fails before that
 * point.
 *
 * Returns 0 on success; EINVAL for an out-of-range activity or for a tag
 * supplied with an activity other than initialize/trim; the spa_open() error
 * if the pool cannot be opened; the error from spa_activity_in_progress();
 * or EINTR when cv_wait_sig() returns 0.
 */
static int
spa_wait_common(const char *pool, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *waited)
{
	/*
	 * The tag is used to distinguish between instances of an activity.
	 * 'initialize' and 'trim' are the only activities that we use this for.
	 * The other activities can only have a single instance in progress in a
	 * pool at one time, making the tag unnecessary.
	 *
	 * There can be multiple devices being replaced at once, but since they
	 * all finish once resilvering finishes, we don't bother keeping track
	 * of them individually, we just wait for them all to finish.
	 */
	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
	    activity != ZPOOL_WAIT_TRIM)
		return (EINVAL);

	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
		return (EINVAL);

	spa_t *spa;
	int error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	/*
	 * Increment the spa's waiter count so that we can call spa_close and
	 * still ensure that the spa_t doesn't get freed before this thread is
	 * finished with it when the pool is exported. We want to call spa_close
	 * before we start waiting because otherwise the additional ref would
	 * prevent the pool from being exported or destroyed throughout the
	 * potentially long wait.
	 */
	mutex_enter(&spa->spa_activities_lock);
	spa->spa_waiters++;
	spa_close(spa, FTAG);

	*waited = B_FALSE;
	for (;;) {
		boolean_t in_progress;
		error = spa_activity_in_progress(spa, activity, use_tag, tag,
		    &in_progress);

		/* Stop on error, on completion, or when cancelled. */
		if (error || !in_progress || spa->spa_waiters_cancel)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&spa->spa_activities_cv,
		    &spa->spa_activities_lock) == 0) {
			error = EINTR;
			break;
		}
	}

	/* Drop our waiter count and signal spa_waiters_cv. */
	spa->spa_waiters--;
	cv_signal(&spa->spa_waiters_cv);
	mutex_exit(&spa->spa_activities_lock);

	return (error);
}

/*
 * Wait for a particular instance of the specified activity to complete, where
 * the instance is identified by 'tag'
 */
int
spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
    boolean_t *waited)
{
	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
}

/*
 * Wait for all instances of the specified activity to complete
 */
int
spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
{
	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
}

/*
 * Build a sysevent for 'name'.
 *
 * In kernel builds an event resource is created with zfs_event_create() and,
 * if that yields one, wrapped in a newly allocated sysevent_t; otherwise NULL
 * is returned. Outside the kernel the arguments are ignored and the result is
 * always NULL.
 */
sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
	if (resource) {
		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
		ev->resource = resource;
	}
#else
	(void) spa, (void) vd, (void) hist_nvl, (void) name;
#endif
	return (ev);
}

/*
 * Post an event built by spa_event_create() and free its sysevent_t wrapper.
 * A NULL 'ev' is ignored; outside the kernel this does nothing.
 */
void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	if (ev) {
		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
		kmem_free(ev, sizeof (*ev));
	}
#else
	(void) ev;
#endif
}

/*
 * Post a zevent corresponding to the given sysevent.  The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
*/
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	/* Create the event and post it immediately. */
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}

/* state manipulation functions */
EXPORT_SYMBOL(spa_open);
EXPORT_SYMBOL(spa_open_rewind);
EXPORT_SYMBOL(spa_get_stats);
EXPORT_SYMBOL(spa_create);
EXPORT_SYMBOL(spa_import);
EXPORT_SYMBOL(spa_tryimport);
EXPORT_SYMBOL(spa_destroy);
EXPORT_SYMBOL(spa_export);
EXPORT_SYMBOL(spa_reset);
EXPORT_SYMBOL(spa_async_request);
EXPORT_SYMBOL(spa_async_suspend);
EXPORT_SYMBOL(spa_async_resume);
EXPORT_SYMBOL(spa_inject_addref);
EXPORT_SYMBOL(spa_inject_delref);
EXPORT_SYMBOL(spa_scan_stat_init);
EXPORT_SYMBOL(spa_scan_get_stats);

/* device manipulation */
EXPORT_SYMBOL(spa_vdev_add);
EXPORT_SYMBOL(spa_vdev_attach);
EXPORT_SYMBOL(spa_vdev_detach);
EXPORT_SYMBOL(spa_vdev_setpath);
EXPORT_SYMBOL(spa_vdev_setfru);
EXPORT_SYMBOL(spa_vdev_split_mirror);

/* spare state (which is global across all pools) */
EXPORT_SYMBOL(spa_spare_add);
EXPORT_SYMBOL(spa_spare_remove);
EXPORT_SYMBOL(spa_spare_exists);
EXPORT_SYMBOL(spa_spare_activate);

/* L2ARC state (which is global across all pools) */
EXPORT_SYMBOL(spa_l2cache_add);
EXPORT_SYMBOL(spa_l2cache_remove);
EXPORT_SYMBOL(spa_l2cache_exists);
EXPORT_SYMBOL(spa_l2cache_activate);
EXPORT_SYMBOL(spa_l2cache_drop);

/* scanning */
EXPORT_SYMBOL(spa_scan);
EXPORT_SYMBOL(spa_scan_range);
EXPORT_SYMBOL(spa_scan_stop);

/* spa syncing */
EXPORT_SYMBOL(spa_sync); /* only for DMU use */
EXPORT_SYMBOL(spa_sync_allpools);

/* properties */
EXPORT_SYMBOL(spa_prop_set);
EXPORT_SYMBOL(spa_prop_get);
EXPORT_SYMBOL(spa_prop_clear_bootfs);

/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
	"Percentage of CPUs to run a metaslab preload taskq");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
	"log2 fraction of arc that can be used by inflight I/Os when "
	"verifying pool during import");

ZFS_MODULE_PARAM(zfs_spa,
spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, "Configure IO queues for read IO"); ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, "Number of CPUs per write issue taskq"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 554de6b7d38f..9d80062e54ab 100644 --- 
a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,5869 +1,5871 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. 
*/ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly affect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge.
However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of blocks' sizes will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well.
*/
int zio_exclude_metadata = 0;
static int zio_requeue_io_start_cut_in_line = 1;

/*
 * Largest buffer size for which zio_init() leaves KMC_NODEBUG off the
 * metadata buffer caches; 0 in non-debug builds, so every cache gets it.
 */
#ifdef ZFS_DEBUG
static const int zio_buf_debug_limit = 16384;
#else
static const int zio_buf_debug_limit = 0;
#endif

/* Named kstat layout exported as "zio_stats". */
typedef struct zio_stats {
	kstat_named_t ziostat_total_allocations;
	kstat_named_t ziostat_alloc_class_fallbacks;
	kstat_named_t ziostat_gang_writes;
	kstat_named_t ziostat_gang_multilevel;
} zio_stats_t;

static zio_stats_t zio_stats = {
	{ "total_allocations",		KSTAT_DATA_UINT64 },
	{ "alloc_class_fallbacks",	KSTAT_DATA_UINT64 },
	{ "gang_writes",		KSTAT_DATA_UINT64 },
	{ "gang_multilevel",		KSTAT_DATA_UINT64 },
};

/* Backing counters; copied into zio_stats by zio_kstats_update(). */
struct {
	wmsum_t ziostat_total_allocations;
	wmsum_t ziostat_alloc_class_fallbacks;
	wmsum_t ziostat_gang_writes;
	wmsum_t ziostat_gang_multilevel;
} ziostat_sums;

/*
 * Add one to the named counter in ziostat_sums.
 * NOTE(review): the expansion already ends in ';', so a use written as
 * "ZIOSTAT_BUMP(x);" produces an extra empty statement.
 */
#define	ZIOSTAT_BUMP(stat)	wmsum_add(&ziostat_sums.stat, 1);

static kstat_t *zio_ksp;

static inline void __zio_execute(zio_t *zio);

static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);

/*
 * kstat update callback for "zio_stats": refreshes every named value in
 * ksp->ks_data from the matching wmsum counter. Writes are rejected with
 * EACCES; reads return 0.
 */
static int
zio_kstats_update(kstat_t *ksp, int rw)
{
	zio_stats_t *zs = ksp->ks_data;
	if (rw == KSTAT_WRITE)
		return (EACCES);

	zs->ziostat_total_allocations.value.ui64 =
	    wmsum_value(&ziostat_sums.ziostat_total_allocations);
	zs->ziostat_alloc_class_fallbacks.value.ui64 =
	    wmsum_value(&ziostat_sums.ziostat_alloc_class_fallbacks);
	zs->ziostat_gang_writes.value.ui64 =
	    wmsum_value(&ziostat_sums.ziostat_gang_writes);
	zs->ziostat_gang_multilevel.value.ui64 =
	    wmsum_value(&ziostat_sums.ziostat_gang_multilevel);
	return (0);
}

/*
 * Set up the zio subsystem: the zio and zio_link kmem caches, the statistics
 * counters and their kstat, the per-size buffer caches in zio_buf_cache[] and
 * zio_data_buf_cache[], and finally fault injection and lz4.
 */
void
zio_init(void)
{
	size_t c;

	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	wmsum_init(&ziostat_sums.ziostat_total_allocations, 0);
	wmsum_init(&ziostat_sums.ziostat_alloc_class_fallbacks, 0);
	wmsum_init(&ziostat_sums.ziostat_gang_writes, 0);
	wmsum_init(&ziostat_sums.ziostat_gang_multilevel, 0);

	/* The kstat is optional; a failed create leaves zio_ksp NULL. */
	zio_ksp = kstat_create("zfs", 0, "zio_stats",
	    "misc", KSTAT_TYPE_NAMED, sizeof (zio_stats) /
	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (zio_ksp != NULL) {
		zio_ksp->ks_data = &zio_stats;
		zio_ksp->ks_update = zio_kstats_update;
		kstat_install(zio_ksp);
	}

	/* Slot c serves buffers of up to (c + 1) << SPA_MINBLOCKSHIFT bytes. */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t align, cflags, data_cflags;
		char name[32];

		/*
		 * Create cache for each half-power of 2 size, starting from
		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
		 * of ~7/8, sufficient for transient allocations mostly using
		 * these caches.
		 */
		size_t p2 = size;
		/* Clear low set bits until only the highest one remains. */
		while (!ISP2(p2))
			p2 &= p2 - 1;
		if (!IS_P2ALIGNED(size, p2 / 2))
			continue;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif

		/*
		 * Page-multiple sizes are page aligned; other sizes are
		 * aligned to the lowest set bit of the size.
		 */
		if (IS_P2ALIGNED(size, PAGESIZE))
			align = PAGESIZE;
		else
			align = 1 << (highbit64(size ^ (size - 1)) - 1);

		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
		    KMC_NODEBUG : 0;
		data_cflags = KMC_NODEBUG;
		if (abd_size_alloc_linear(size)) {
			cflags |= KMC_RECLAIMABLE;
			data_cflags |= KMC_RECLAIMABLE;
		}
		if (cflags == data_cflags) {
			/*
			 * Resulting kmem caches would be identical.
			 * Save memory by creating only one.
			 */
			(void) snprintf(name, sizeof (name),
			    "zio_buf_comb_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size, align,
			    NULL, NULL, NULL, NULL, NULL, cflags);
			zio_data_buf_cache[c] = zio_buf_cache[c];
			continue;
		}
		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
		    (ulong_t)size);
		zio_buf_cache[c] = kmem_cache_create(name, size, align,
		    NULL, NULL, NULL, NULL, NULL, cflags);

		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
		    (ulong_t)size);
		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
		    NULL, NULL, NULL, NULL, NULL, data_cflags);
	}

	/*
	 * Walk down from the largest slot and point every slot that was
	 * skipped above at the cache of the next larger slot.
	 */
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();

	lz4_init();
}

/*
 * Tear down what zio_init() set up: destroy every distinct buffer cache
 * exactly once, delete the kstat, release the counters, destroy the zio and
 * zio_link caches, and shut down fault injection and lz4.
 */
void
zio_fini(void)
{
	size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;

#if defined(ZFS_DEBUG) && !defined(_KERNEL)
	/* Report any slot whose alloc and free counts disagree. */
	for (size_t i = 0; i < n; i++) {
		if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
			(void) printf("zio_fini: [%d] %llu != %llu\n",
			    (int)((i + 1) << SPA_MINBLOCKSHIFT),
			    (long long unsigned)zio_buf_cache_allocs[i],
			    (long long unsigned)zio_buf_cache_frees[i]);
	}
#endif

	/*
	 * The same kmem cache can show up multiple times in both zio_buf_cache
	 * and zio_data_buf_cache.  Do a wasteful but trivially correct scan to
	 * sort it out.
	 */
	for (size_t i = 0; i < n; i++) {
		kmem_cache_t *cache = zio_buf_cache[i];
		if (cache == NULL)
			continue;
		/* Clear every alias of this cache before destroying it. */
		for (size_t j = i; j < n; j++) {
			if (cache == zio_buf_cache[j])
				zio_buf_cache[j] = NULL;
			if (cache == zio_data_buf_cache[j])
				zio_data_buf_cache[j] = NULL;
		}
		kmem_cache_destroy(cache);
	}

	/* What remains are caches used only by zio_data_buf_cache. */
	for (size_t i = 0; i < n; i++) {
		kmem_cache_t *cache = zio_data_buf_cache[i];
		if (cache == NULL)
			continue;
		for (size_t j = i; j < n; j++) {
			if (cache == zio_data_buf_cache[j])
				zio_data_buf_cache[j] = NULL;
		}
		kmem_cache_destroy(cache);
	}

	for (size_t i = 0; i < n; i++) {
		VERIFY3P(zio_buf_cache[i], ==, NULL);
		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
	}

	if (zio_ksp != NULL) {
		kstat_delete(zio_ksp);
		zio_ksp = NULL;
	}

	wmsum_fini(&ziostat_sums.ziostat_total_allocations);
	wmsum_fini(&ziostat_sums.ziostat_alloc_class_fallbacks);
	wmsum_fini(&ziostat_sums.ziostat_gang_writes);
	wmsum_fini(&ziostat_sums.ziostat_gang_multilevel);

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();

	lz4_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/* The buffer canary is only compiled into debug kernel builds. */
#if defined(ZFS_DEBUG) && defined(_KERNEL)
#define	ZFS_ZIO_BUF_CANARY	1
#endif

#ifdef ZFS_ZIO_BUF_CANARY
static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;

/*
 * Use empty space after the buffer to detect overflows.
 *
 * Since zio_init() creates kmem caches only for certain set of buffer sizes,
 * allocations of different sizes may have some unused space after the data.
 * Filling part of that space with a known pattern on allocation and checking
 * it on free should allow us to detect some buffer overflows.
*/ static void zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) *canary = zio_buf_canary; } static void zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && cache[c] == cache[c + 1]) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) { if (unlikely(*canary != zio_buf_canary)) { PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", p, size, (canary - p) * sizeof (ulong_t), *canary, zio_buf_canary); } } } #endif /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); #ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_buf_cache, c); #endif return (p); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 *
 * 'size' must be between 1 and SPA_MAXBLOCKSIZE; the cache slot index is
 * verified before use.
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
#ifdef ZFS_ZIO_BUF_CANARY
	zio_buf_put_canary(p, size, zio_data_buf_cache, c);
#endif
	return (p);
}

/*
 * Return a buffer to the zio_buf_cache slot selected by 'size'. 'size' picks
 * the cache, so it has to select the same slot as the size used to allocate.
 * When the canary is compiled in, it is checked before the free.
 */
void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
	atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif

#ifdef ZFS_ZIO_BUF_CANARY
	zio_buf_check_canary(buf, size, zio_buf_cache, c);
#endif
	kmem_cache_free(zio_buf_cache[c], buf);
}

/*
 * Return a buffer to the zio_data_buf_cache slot selected by 'size'. When the
 * canary is compiled in, it is checked before the free.
 */
void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

#ifdef ZFS_ZIO_BUF_CANARY
	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
#endif
	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * Free an ABD passed as an untyped pointer; 'size' is ignored.
 */
static void
zio_abd_free(void *abd, size_t size)
{
	(void) size;
	abd_free((abd_t *)abd);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */

/*
 * Make 'data'/'size' the zio's current buffer, remembering the previous
 * io_abd/io_size in a new record pushed onto io_transform_stack.
 *
 * 'bufsize' and 'transform' are stored in the record and acted upon by
 * zio_pop_transforms(): a non-zero bufsize causes the pushed buffer to be
 * freed there, and a non-NULL transform is called there.
 */
void
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_abd = zio->io_abd;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_abd = data;
	zio->io_size = size;
}

/*
 * Unwind the whole transform stack, most recently pushed record first. For
 * each record: call its transform callback (if any) with the saved buffer
 * and size, free the zio's current buffer if the record has a non-zero
 * bufsize, restore the saved io_abd/io_size, and free the record.
 */
void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_abd, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			abd_free(zio->io_abd);

		zio->io_abd = zt->zt_orig_abd;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks, decompression, and decryption
 * ==========================================================================
 */

/*
 * Transform callback for an I/O whose current buffer is larger than the
 * original one (asserted). For reads, 'size' bytes are copied with
 * abd_copy(data, zio->io_abd, size); other I/O types are left alone.
 */
static void
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		abd_copy(data, zio->io_abd, size);
}

/*
 * Transform callback that decompresses zio->io_abd (io_size bytes) into
 * 'data' ('size' bytes) using the compression function from the block
 * pointer. Skipped when the zio already carries an error. Any failure,
 * real or injected, is recorded in io_error as EIO.
 */
static void
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
	if (zio->io_error == 0) {
		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
		    zio->io_abd, data, zio->io_size, size,
		    &zio->io_prop.zp_complevel);

		/* Fault injection applies only if decompression succeeded. */
		if (zio_injection_enabled && ret == 0)
			ret = zio_handle_fault_injection(zio, EINVAL);

		if (ret != 0)
			zio->io_error = SET_ERROR(EIO);
	}
}

static void
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
{
	int ret;
	void *tmp;
	blkptr_t *bp = zio->io_bp;
	spa_t *spa = zio->io_spa;
	uint64_t dsobj = zio->io_bookmark.zb_objset;
	uint64_t lsize = BP_GET_LSIZE(bp);
	dmu_object_type_t ot = BP_GET_TYPE(bp);
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	ASSERT(BP_USES_CRYPT(bp));
	ASSERT3U(size, !=, 0);

	if (zio->io_error != 0)
		return;

	/*
	 * Verify the cksum of MACs stored in an indirect bp. It will always
	 * be possible to verify this since it does not require an encryption
	 * key.
	 */
	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
		zio_crypt_decode_mac_bp(bp, mac);

		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
			/*
			 * We haven't decompressed the data yet, but
			 * zio_crypt_do_indirect_mac_checksum() requires
			 * decompressed data to be able to parse out the MACs
			 * from the indirect block. We decompress it now and
			 * throw away the result after we are finished.
*/ abd_t *abd = abd_alloc_linear(lsize, B_TRUE); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, abd, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { abd_free(abd); ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, abd, lsize, BP_SHOULD_BYTESWAP(bp), mac); abd_free(abd); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags 
& ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } static void zio_add_child_impl(zio_t *pio, zio_t *cio, boolean_t first) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); /* Parent should not have READY stage if child doesn't have it. 
*/ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && (cio->io_child_type != ZIO_CHILD_VDEV), (pio->io_pipeline & ZIO_STAGE_READY) == 0); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); if (first) ASSERT(list_is_empty(&cio->io_parent_list)); else mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); if (!first) mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_add_child_impl(pio, cio, B_FALSE); } static void zio_add_child_first(zio_t *pio, zio_t *cio) { zio_add_child_impl(pio, cio, B_TRUE); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = 
&pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); /* * Propogate the Direct I/O checksum verify failure to the parent. */ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We do this if the parent's zio type matches the child's * type, or if it's a zio_null() with no done callback, and so * has no actual work to do. Otherwise dispatch the parent zio * in its own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. 
*/ if (next_to_executep != NULL && *next_to_executep == NULL && (pio->io_type == zio->io_type || (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 
ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; if (flags & ZIO_FLAG_PREALLOCATED) { BP_ZERO_DVAS(zio->io_bp); BP_SET_BIRTH(zio->io_bp, 0, 0); } } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = 
pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } /* * ZIO intended to be between others. Provides synchronization at READY * and DONE pipeline stages and calls the respective callbacks. */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } /* * ZIO intended to be a root of a tree. Unlike null ZIO does not have a * READY pipeline stage (is ready on creation), so it should not be used * as child of any ZIO that may need waiting for grandchildren READY stage * (any other ZIO type). */ zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); return (zio); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) 
{ va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)BP_GET_PHYSICAL_BIRTH(bp), (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out 0 is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. 
*
 * Values for blk_verify_flag:
 *   BLK_VERIFY_ONLY: evaluate the block
 *   BLK_VERIFY_LOG: evaluate the block and log problems
 *   BLK_VERIFY_HALT: call zfs_panic_recover on error
 *
 * Values for blk_config_flag:
 *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
 *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
 *   obtained for reader
 *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
 *   performance
 */
int
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
    enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
{
	int errors = 0;

	if (unlikely(!DMU_OT_IS_VALID(BP_GET_TYPE(bp)))) {
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has invalid TYPE %llu",
		    bp, (longlong_t)BP_GET_TYPE(bp));
	}
	if (unlikely(BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS)) {
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has invalid COMPRESS %llu",
		    bp, (longlong_t)BP_GET_COMPRESS(bp));
	}
	if (unlikely(BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE)) {
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has invalid LSIZE %llu",
		    bp, (longlong_t)BP_GET_LSIZE(bp));
	}
	if (BP_IS_EMBEDDED(bp)) {
		if (unlikely(BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px has invalid ETYPE %llu",
			    bp, (longlong_t)BPE_GET_ETYPE(bp));
		}
		if (unlikely(BPE_GET_PSIZE(bp) > BPE_PAYLOAD_SIZE)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px has invalid PSIZE %llu",
			    bp, (longlong_t)BPE_GET_PSIZE(bp));
		}
		/* Embedded BPs get none of the checks below. */
		return (errors ? ECKSUM : 0);
	} else if (BP_IS_HOLE(bp)) {
		/*
		 * Holes are allowed (expected, even) to have no DVAs, no
		 * checksum, and no psize.
		 */
		return (errors ? ECKSUM : 0);
	} else if (unlikely(!DVA_IS_VALID(&bp->blk_dva[0]))) {
		/* Non-hole, non-embedded BPs _must_ have at least one DVA */
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has no valid DVAs", bp);
	}
	if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has invalid CHECKSUM %llu",
		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
	}
	if (unlikely(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE)) {
		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
		    "blkptr at %px has invalid PSIZE %llu",
		    bp, (longlong_t)BP_GET_PSIZE(bp));
	}

	/*
	 * Do not verify individual DVAs if the config is not trusted. This
	 * will be done once the zio is executed in vdev_mirror_map_alloc.
	 */
	if (unlikely(!spa->spa_trust_config))
		return (errors ? ECKSUM : 0);

	switch (blk_config) {
	case BLK_CONFIG_HELD:
		ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
		break;
	case BLK_CONFIG_NEEDED:
		spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
		break;
	case BLK_CONFIG_NEEDED_TRY:
		/*
		 * If the try-enter fails, EBUSY is returned and any errors
		 * counted so far are not reported through the return value.
		 */
		if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
			return (EBUSY);
		break;
	case BLK_CONFIG_SKIP:
		return (errors ? ECKSUM : 0);
	default:
		panic("invalid blk_config %u", blk_config);
	}

	/*
	 * Pool-specific checks.
	 *
	 * Note: it would be nice to verify that the logical birth
	 * and physical birth are not too large.  However,
	 * spa_freeze() allows the birth time of log blocks (and
	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
	 * large.
	 */
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		const dva_t *dva = &bp->blk_dva[i];
		uint64_t vdevid = DVA_GET_VDEV(dva);

		if (unlikely(vdevid >= spa->spa_root_vdev->vdev_children)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px DVA %u has invalid VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
		if (unlikely(vd == NULL)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px DVA %u has invalid VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (unlikely(vd->vdev_ops == &vdev_hole_ops)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px DVA %u has hole VDEV %llu",
			    bp, i, (longlong_t)vdevid);
			continue;
		}
		if (vd->vdev_ops == &vdev_missing_ops) {
			/*
			 * "missing" vdevs are valid during import, but we
			 * don't have their detailed info (e.g. asize), so
			 * we can't perform any more checks on them.
			 */
			continue;
		}
		uint64_t offset = DVA_GET_OFFSET(dva);
		uint64_t asize = DVA_GET_ASIZE(dva);
		/* Gang DVAs are bounds-checked with the gang header size. */
		if (DVA_GET_GANG(dva))
			asize = vdev_gang_header_asize(vd);
		if (unlikely(offset + asize > vd->vdev_asize)) {
			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
			    "blkptr at %px DVA %u has invalid OFFSET %llu",
			    bp, i, (longlong_t)offset);
		}
	}
	/* Drop SCL_VDEV only in the modes where this function took it. */
	if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
	    BLK_CONFIG_NEEDED_TRY)
		spa_config_exit(spa, SCL_VDEV, bp);

	return (errors ? ECKSUM : 0);
}

/*
 * Check a single DVA against the pool's current vdev tree and return
 * B_TRUE only if it refers to an existing top-level vdev that is neither a
 * hole nor missing and the extent fits within the vdev's asize.  Unlike
 * the per-DVA loop in zfs_blkptr_verify(), a "missing" vdev is treated as
 * invalid here and nothing is logged.  'bp' is unused.
 */
boolean_t
zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
{
	(void) bp;
	uint64_t vdevid = DVA_GET_VDEV(dva);

	if (vdevid >= spa->spa_root_vdev->vdev_children)
		return (B_FALSE);

	vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
	if (vd == NULL)
		return (B_FALSE);

	if (vd->vdev_ops == &vdev_hole_ops)
		return (B_FALSE);

	if (vd->vdev_ops == &vdev_missing_ops) {
		return (B_FALSE);
	}

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t asize = DVA_GET_ASIZE(dva);

	if (DVA_GET_GANG(dva))
		asize = vdev_gang_header_asize(vd);
	if (offset + asize > vd->vdev_asize)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Create a read zio for block 'bp' into 'data'.  'size' is used as both
 * the logical and the physical size, and the zio's txg is the bp's birth.
 * DDT children (ZIO_FLAG_DDT_CHILD) get the DDT child read pipeline.
 */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
	    data, size, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

/*
 * Create a write zio.  The pipeline is chosen in this order: Direct I/O
 * write (zp->zp_direct_write), DDT child write (ZIO_FLAG_DDT_CHILD), plain
 * write.  The 'ready' and 'children_ready' callbacks and a copy of the
 * write properties '*zp' are stored in the new zio.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *children_ready,
    zio_done_func_t *done, void *private, zio_priority_t priority,
    zio_flag_t flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;
	enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
	    ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;

	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, pipeline);

	zio->io_ready = ready;
	zio->io_children_ready = children_ready;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP.  But we may need the data to
	 * verify a dedup hit (if requested).  In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim). Encrypted
	 * dedup blocks need data as well so we also disable dedup in this
	 * case.
	 */
	if (data == NULL &&
	    (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}

/*
 * Create a write zio that uses the rewrite pipeline and has
 * ZIO_FLAG_IO_REWRITE set.  'size' is used as both lsize and psize.
 */
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

/*
 * Attach an override block pointer to a not-yet-issued logical write in
 * the currently syncing txg, and set the nopwrite / brtwrite / copies
 * properties to go with it.  'nopwrite' and 'brtwrite' must not both be
 * set.
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
    boolean_t nopwrite, boolean_t brtwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
	ASSERT(!brtwrite || !nopwrite);

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_brtwrite = brtwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_prop.zp_gang_copies = gang_copies;
	zio->io_bp_override = bp;
}

/*
 * Free block 'bp' in 'txg'.  The bp is verified first (halting on a bad
 * pointer), then either queued on the per-txg free bplist or freed
 * immediately; see the comments in the body for the conditions.
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);

	/*
	 * The check for EMBEDDED is a performance optimization.  We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 *
	 * Note that we only defer frees after zfs_sync_pass_deferred_free
	 * when the log space map feature is disabled. [see relevant comment
	 * in spa_sync_iterate_to_convergence()]
	 */
	if (BP_IS_GANG(bp) ||
	    BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
	    brt_maybe_exists(spa, bp)) {
		metaslab_check_free(spa, bp);
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		/* This path must complete without creating a zio. */
		VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
	}
}

/*
 * To improve performance, this function may return NULL if we were able
 * to do the free immediately.  This avoids the cost of creating a zio
 * (and linking it to the parent, etc).
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_flag_t flags)
{
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);

	if (BP_IS_EMBEDDED(bp))
		return (NULL);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);
	dsl_scan_freed(spa, bp);

	if (BP_IS_GANG(bp) ||
	    BP_GET_DEDUP(bp) ||
	    brt_maybe_exists(spa, bp)) {
		/*
		 * GANG, DEDUP and BRT blocks can induce a read (for the gang
		 * block header, the DDT or the BRT), so issue them
		 * asynchronously so that this thread is not tied up.
		 */
		enum zio_stage stage =
		    ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;

		return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
		    BP_GET_PSIZE(bp), NULL, NULL,
		    ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
		    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage));
	} else {
		metaslab_free(spa, bp, txg, B_FALSE);
		return (NULL);
	}
}

/*
 * Create a claim zio for 'bp'.  The bp is verified first (halting on a
 * bad pointer); the config lock is treated as already held when the
 * caller passes ZIO_FLAG_CONFIG_WRITER.  For an embedded bp a null zio is
 * returned instead of a claim zio.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, zio_flag_t flags)
{
	zio_t *zio;

	(void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
	    spa_min_claim_txg(spa));
	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
	ASSERT0(zio->io_queued_timestamp);

	return (zio);
}

/*
 * Create a physical TRIM zio for [offset, offset + size) on a vdev with
 * no children.  Offset and size must be multiples of 1 << vdev_ashift and
 * size must be non-zero.
 */
zio_t *
zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    zio_done_func_t *done, void *private, zio_priority_t priority,
    zio_flag_t flags, enum trim_flag trim_flags)
{
	zio_t *zio;

	ASSERT0(vd->vdev_children);
	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	ASSERT3U(size, !=, 0);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
	    private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
	zio->io_trim_flags = trim_flags;

	return (zio);
}

/*
 * Create a physical read zio at a raw device offset on a vdev with no
 * children.  When 'labels' is set the range is asserted to lie within the
 * leading VDEV_LABEL_START_SIZE bytes or the trailing VDEV_LABEL_END_SIZE
 * bytes of the device.  'checksum' is stored in the zio's properties.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    abd_t *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, zio_flag_t flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
	    offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

/*
 * Create a physical write zio at a raw device offset; the write-side
 * counterpart of zio_read_phys() with the same range assertions.  For
 * checksums flagged ZCHECKSUM_FLAG_EMBEDDED the data is first copied into
 * a private buffer (see the comment in the body).
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    abd_t *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, zio_flag_t flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
	    private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
	    offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		abd_t *wbuf = abd_alloc_sametype(data, size);
		abd_copy(wbuf, data, size);

		/* Non-zero bufsize: the pop frees wbuf for us. */
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    abd_t *data, uint64_t size, int type, zio_priority_t priority,
    zio_flag_t flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	/*
	 * vdev child I/Os do not propagate their error to the parent.
	 * Therefore, for correct operation the caller *must* check for
	 * and handle the error in the child i/o's done callback.
	 * The only exceptions are i/os that we don't care about
	 * (OPTIONAL or REPAIR).
	 */
	ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
	    done != NULL);

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
		/*
		 * We never allow the mirror VDEV to attempt reading from any
		 * additional data copies after the first Direct I/O checksum
		 * verify failure. This is to avoid bad data being written out
		 * through the mirror during self healing. See comment in
		 * vdev_mirror_io_done() for more details.
		 */
		ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
	} else if (type == ZIO_TYPE_WRITE &&
	    pio->io_prop.zp_direct_write == B_TRUE) {
		/*
		 * By default we only will verify checksums for Direct I/O
		 * writes for Linux. FreeBSD is able to place user pages under
		 * write protection before issuing them to the ZIO pipeline.
		 *
		 * Checksum validation errors will only be reported through
		 * the top-level VDEV, which is set by this child ZIO.
		 */
		ASSERT3P(bp, !=, NULL);
		ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
		pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
	}

	/* For leaf vdevs the offset is advanced past the label area. */
	if (vd->vdev_ops->vdev_op_leaf) {
		ASSERT0(vd->vdev_children);
		offset += VDEV_LABEL_START_SIZE;
	}

	flags |= ZIO_VDEV_CHILD_FLAGS(pio);

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	/*
	 * If we're creating a child I/O that is not associated with a
	 * top-level vdev, then the child zio is not an allocating I/O.
	 * If this is a retried I/O then we ignore it since we will
	 * have already processed the original allocating I/O.
	 */
	if (flags & ZIO_FLAG_IO_ALLOCATING &&
	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
		ASSERT(pio->io_metaslab_class != NULL);
		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
		ASSERT(type == ZIO_TYPE_WRITE);
		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
		    pio->io_child_type == ZIO_CHILD_GANG);

		flags &= ~ZIO_FLAG_IO_ALLOCATING;
	}

	/* The child starts one stage before VDEV_IO_START. */
	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);

	return (zio);
}

/*
 * Create a parentless vdev I/O on a leaf vdev.  CANFAIL, DONT_RETRY and
 * DELEGATED are always added to 'flags'.  Unlike zio_vdev_child_io(),
 * 'offset' is passed through unchanged and there is no bp or bookmark.
 */
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
    zio_type_t type, zio_priority_t priority, zio_flag_t flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}


/*
 * Send a flush command to the given vdev. Unlike most zio creation functions,
 * the flush zios are issued immediately. You can wait on pio to pause until
 * the flushes complete.
*/ void zio_flush(zio_t *pio, vdev_t *vd) { const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY; if (vd->vdev_nowritecache) return; if (vd->vdev_children == 0) { zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); } else { for (uint64_t c = 0; c < vd->vdev_children; c++) zio_flush(pio, vd->vdev_child[c]); } } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * Round provided allocation size up to a value that can be allocated * by at least some vdev(s) in the pool with minimum or no additional * padding and without extra space usage on others */ static uint64_t zio_roundup_alloc_size(spa_t *spa, uint64_t size) { if (size > spa->spa_min_alloc) return (roundup(size, spa->spa_gcd_alloc)); return (spa->spa_min_alloc); } size_t zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len) { size_t d_len; /* minimum 12.5% must be saved (legacy value, may be changed later) */ d_len = s_len - (s_len >> 3); /* ZLE can't use exactly d_len bytes, it needs more, so ignore it */ if (compress == ZIO_COMPRESS_ZLE) return (d_len); d_len = d_len - d_len % gcd_alloc; if (d_len < min_alloc) return (BPE_PAYLOAD_SIZE); return (d_len); } /* * ========================================================================== * Prepare to read and write logical blocks * 
========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. 
*/ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } /* * Write pipeline stage. Returns NULL while logical or gang children have yet to * reach READY, and returns zio unchanged when the I/O is not allocating. Otherwise * it optionally compresses the data, chooses between ZIO_REWRITE_PIPELINE and * ZIO_WRITE_PIPELINE, and fills in the bp (or sets up a hole or an embedded bp, * both of which run only ZIO_INTERLOCK_PIPELINE). */ static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence.
*/ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || MIN(zp->zp_copies, spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { abd_t *cabd = NULL; if (abd_cmp_zero(zio->io_abd, lsize) == 0) psize = 0; else if (compress == ZIO_COMPRESS_EMPTY) psize = lsize; else psize = zio_compress_data(compress, zio->io_abd, &cabd, lsize, zio_get_compression_max_size(compress, spa->spa_gcd_alloc, spa->spa_min_alloc, lsize), zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cabd != NULL) abd_free(cabd); } else if (psize <= BPE_PAYLOAD_SIZE && !zp->zp_encrypt && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { /* Payload fits in the bp: encode it there, free cabd and run only ZIO_INTERLOCK_PIPELINE. */ void *cbuf = abd_borrow_buf_copy(cabd, lsize); encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); abd_return_buf(cabd, cbuf, lsize); abd_free(cabd); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector.
*/ size_t rounded = (size_t)zio_roundup_alloc_size(spa, psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; abd_free(cabd); psize = lsize; } else { abd_zero_off(cabd, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cabd, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. * * NOTE(review): io_bp_override was already asserted to be NULL earlier * in this function, so the reset of io_bp_override below looks redundant * here -- confirm before relying on this comment. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ if (abd_cmp_zero(zio->io_abd, lsize) == 0) { psize = 0; compress = ZIO_COMPRESS_OFF; } else { psize = lsize; } } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case.
*/ if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } /* * Free pipeline stage: logical frees of bps with the dedup bit set switch to * ZIO_DDT_FREE_PIPELINE; every other free keeps its pipeline. Always returns zio. */ static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ /* * Choose the taskq type 't' (initially the zio's io_type) and queue 'q' for zio, * then hand it to spa_taskq_dispatch() to run zio_execute(). */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; /* * If we're a config writer or a
probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available or cut the line otherwise. */ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; else cutinline = B_TRUE; } ASSERT3U(q, <, ZIO_TASKQ_TYPES); spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } /* * Return B_TRUE if the taskq reported by taskq_of_curthread() is one of this * spa's zio taskqs of queue type 'q', for any zio type; B_FALSE otherwise. */ static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } /* * Pipeline stage: dispatch zio to the ZIO_TASKQ_ISSUE queue and return NULL, * which makes the calling __zio_execute() loop stop. */ static zio_t * zio_issue_async(zio_t *zio) { ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } /* Dispatch zio (passed as void *) to the ZIO_TASKQ_INTERRUPT queue. */ void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } /* * Call zio_interrupt() on zio. In _KERNEL builds, when io_target_timestamp is * set and still in the future, the call is scheduled via taskq_dispatch_delay() * instead of being made immediately. */ void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer.
*/ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; int ticks = MAX(1, NSEC_TO_TICK(diff)); clock_t expire_at_tick = ddi_get_lbolt() + ticks; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just finish the * zio without a delay. */ zio_interrupt(zio); } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } /* * Recursive worker for zio_deadman(). Logs pio through zfs_dbgmsg() and posts * FM_EREPORT_ZFS_DEADMAN when zio_deadman_log_all is set or pio targets a leaf * vdev. A childless leaf-vdev zio that is ZIO_QS_ACTIVE with an empty taskq * entry is failed with EINTR and passed to zio_interrupt() when the deadman * failmode is ZIO_FAILURE_MODE_CONTINUE. Finally recurses into each child with * ziodepth + 1 while holding pio->io_lock. */ static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; /* NOTE(review): io_timestamp, io_delta, io_delay and vq_io_complete_ts are passed to %llu without the (u_longlong_t) cast applied to the other 64-bit arguments. */ zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ?
vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); } if (vd != NULL && vd->vdev_ops->vdev_op_leaf && list_is_empty(&pio->io_child_list) && failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent) && pio->io_queue_state == ZIO_QS_ACTIVE) { pio->io_error = EINTR; zio_interrupt(pio); } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. * * Does nothing when zfs_deadman_enabled is clear or the pool is suspended. * After walking the zio tree, the deadman failmode selects the final action: * log a "waiting" message, log a "restarting" message, or fm_panic(). */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread.
In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ /* Forward declaration of the stage-function table that __zio_execute() indexes by stage bit. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. * * The call is bracketed by spl_fstrans_mark() / spl_fstrans_unmark(). */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. * * Returns B_TRUE when the caller should re-dispatch the zio to a taskq * rather than keep executing it in place (see the use in __zio_execute()), * and B_FALSE otherwise; always B_FALSE when HAVE_LARGE_STACKS is defined. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context.
*/ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } /* * Advance zio through its pipeline: repeatedly shift to the next stage bit that * is set in io_pipeline and call the matching zio_pipeline[] entry. The loop * ends when io_stage reaches ZIO_STAGE_DONE, when a stage returns NULL, or when * the zio is re-dispatched to the ZIO_TASKQ_ISSUE queue. Because a stage may * return a different zio, the loop can continue on that zio. */ __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* Only now is the new stage committed; the early returns above leave io_stage untouched. */ zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop.
*/ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ /* * Execute zio synchronously: run the pipeline, sleep on io_cv until io_executor * is cleared, then destroy the zio and return its io_error. A NULL zio returns * 0. While waiting, zio_deadman() is called once the zio has been queued longer * than spa_deadman_ziotime() (when zfs_deadman_enabled is set). */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { /* io_lock is dropped around zio_deadman(), which takes io_lock itself while walking children. */ mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } /* * Start executing zio without waiting for it; a NULL zio is ignored. A logical * zio with no parent is first attached to one of the spa's async root zios. */ void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool.
*/ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); if (zio->io_type == ZIO_TYPE_WRITE) { spa_select_allocator(zio); } __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ /* * Restart a logical zio (passed as void *) from its original state: restore the * original flags, stage and pipeline, clear its error state, re-add its wait * counts to every parent, recursively reexecute the children that existed when * the walk started, and finally execute the zio itself unless it carries * ZIO_FLAG_GODFATHER. */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); /* * It's possible for a failed ZIO to be a descendant of more than one * ZIO tree. When reexecuting it, we have to be sure to add its wait * states to all parent wait counts. * * Those parents, in turn, may have other children that are currently * active, usually because they've already been reexecuted after * resuming. Those children may be executing and may call * zio_notify_parent() at the same time as we're updating our parent's * counts. To avoid races while updating the counts, we take * gio->io_lock before each update.
*/ zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { mutex_enter(&gio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) { /* !io_state[w] is 1 for each wait type pio has not yet reached. */ gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } mutex_exit(&gio->io_lock); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); /* pio->io_lock is dropped across the recursive call and retaken afterwards. */ mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it.
*/ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } /* * Suspend the pool for 'reason'. Panics instead when the pool's failmode is * ZIO_FAILURE_MODE_PANIC; the console warning is skipped for ZIO_SUSPEND_MMP. * The suspend root zio is created on first use and, when 'zio' is non-NULL, * zio is attached beneath it so that zio_resume() can reexecute it. */ void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); if (reason != ZIO_SUSPEND_MMP) { cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " "I/O failure and has been suspended.", spa_name(spa)); } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); txg_wait_kick(spa->spa_dsl_pool); } /* * Clear the pool's suspended state, wake every waiter on spa_suspend_cv, and * reexecute and wait on the detached suspend root zio. Returns 0 when there was * no suspend root, otherwise the result of zio_wait() on it. */ int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspended != ZIO_SUSPEND_NONE) cmn_err(CE_WARN, "Pool '%s' was suspended and is being " "resumed.
Failed I/O will be retried.", spa_name(spa)); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } /* Block on spa_suspend_cv, under spa_suspend_lock, until spa_suspended(spa) is false. */ void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader.
* * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). 
If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ /* Done callback used by the gang read/rewrite children below: frees the child's io_abd. */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } /* * Gang issue callback for reads. Gang headers (gn != NULL) need no I/O and * return pio itself; gang members are read into 'data' at 'offset' through a * child zio_read() that uses zio_gang_issue_func_done as its done callback. */ static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* * Gang issue callback for rewrites. Gang headers (gn != NULL) are rewritten * from gn->gn_gbh; gang members are rewritten from 'data' at 'offset'. Returns * the child zio created by zio_rewrite(). */ static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage.
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* * Gang issue callback for frees; gn, data and offset are unused. When * zio_free_sync() returns NULL, a zio_null() child is returned instead so the * caller always gets a zio. */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } /* Gang issue callback for claims; gn, data and offset are unused. Returns the zio_claim() child. */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } /* Per-type gang callbacks, indexed by the gang leader's io_type in zio_gang_tree_issue(); presumably the slot order follows zio_type_t -- confirm against its definition. */ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); /* * Allocate a zeroed gang node plus its SPA_GANGBLOCKSIZE header buffer, store * the node in *gnpp (which must be NULL on entry) and return it. */ static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } /* * Free a single gang node and its header buffer, then set *gnpp to NULL. All * child slots must already be NULL (asserted). */ static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } /* Recursively free the gang tree rooted at *gnpp, children first; a NULL root is a no-op. */ static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } /* * Allocate a gang node in *gnpp and issue, without waiting, a child read of the * gang header at 'bp' into the node's gn_gbh buffer. zio_gang_tree_assemble_done() * runs as the read's done callback with the node as its private data. */ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) { cio->io_allocator = pio->io_allocator; } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; /* * If we're getting direct-invoked from zio_write_gang_block(), * the bp_orig will be set. 
*/ ASSERT(BP_IS_HOLE(&zio->io_bp_orig) || zio->io_flags & ZIO_FLAG_PREALLOCATED); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * Store multiple copies of the GBH, so that we can still traverse * all the data (e.g. to free or scrub) even if a block is damaged. * This value respects the redundant_metadata property. */ int gbh_copies = gio->io_prop.zp_gang_copies; if (gbh_copies == 0) { /* * This should only happen in the case where we're filling in * DDT entries for a parent that wants more copies than the DDT * has. In that case, we cannot gang without creating a mixed * blkptr, which is illegal. 
*/ ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT); pio->io_error = EAGAIN; return (pio); } ASSERT3S(gbh_copies, >, 0); ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP); ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio->io_allocator, pio); if (error) { pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { boolean_t more; VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies, zio, B_TRUE, &more)); } /* * Create and nowait the gang children. First, we try to do * opportunistic allocations. If that fails to generate enough * space, we fall back to normal zio_write calls for nested gang. 
*/ for (int g = 0; resid != 0; g++) { flags &= METASLAB_ASYNC_ALLOC; flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_gang_copies = gio->io_prop.zp_gang_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; zp.zp_direct_write = B_FALSE; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); uint64_t min_size = zio_roundup_alloc_size(spa, resid / (SPA_GBH_NBLKPTRS - g)); min_size = MIN(min_size, resid); bp = &gbh->zg_blkptr[g]; zio_alloc_list_t cio_list; metaslab_trace_init(&cio_list); uint64_t allocated_size = UINT64_MAX; error = metaslab_alloc_range(spa, mc, min_size, resid, bp, gio->io_prop.zp_copies, txg, NULL, flags, &cio_list, zio->io_allocator, NULL, &allocated_size); boolean_t allocated = error == 0; uint64_t psize = allocated ? MIN(resid, allocated_size) : min_size; zio_t *cio = zio_write(zio, spa, txg, bp, has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, psize, psize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio) | (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark); resid -= psize; zio_gang_inherit_allocator(zio, cio); if (allocated) { metaslab_trace_move(&cio_list, &cio->io_alloc_list); metaslab_group_alloc_increment_all(spa, &cio->io_bp_orig, zio->io_allocator, flags, psize, cio); } /* * We do not reserve for the child writes, since we already * reserved for the parent. Unreserve though will be called * for individual children. We can do this since sum of all * child's physical sizes is equal to parent's physical size. 
* It would not work for potentially bigger allocation sizes. */ zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. 
*/ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. 
*/ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); if (zio->io_error == 0) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* this phys variant doesn't need repair */ ddt_phys_clear(dde->dde_phys, v); } if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (v_self == DDT_PHYS_NONE) return (zio); /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { 
blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_io->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). 
*/ for (int p = 0; p < DDT_NPHYS(ddt); p++) { if (DDT_PHYS_IS_DITTO(ddt, p)) continue; if (dde->dde_io == NULL) continue; zio_t *lio = dde->dde_io->dde_lead_zio[p]; if (lio == NULL) continue; if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, 
zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); /* we're the lead, so once we're done there's no one else outstanding */ if (dde->dde_io->dde_lead_zio[p] == zio) dde->dde_io->dde_lead_zio[p] = NULL; ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; if (zio->io_error != 0) { /* * The write failed, so we're about to abort the entire IO * chain. We need to revert the entry back to what it was at * the last time it was successfully extended. */ ddt_phys_unextend(ddp, orig, v); ddt_phys_clear(orig, v); ddt_exit(ddt); return; } /* * Add references for all dedup writes that were waiting on the * physical one, skipping any other physical writes that are waiting. */ zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_phys_addref(ddp, v); } /* * We've successfully added new DVAs to the entry. Clear the saved * state or, if there's still outstanding IO, remember it so we can * revert to a known good state if that IO fails. 
*/ if (dde->dde_io->dde_lead_zio[p] == NULL) ddt_phys_clear(orig, v); else ddt_phys_copy(orig, ddp, v); ddt_exit(ddt); } static void zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; zio_link_t *zl = NULL; ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_is_gang(dde->dde_phys, v)) { for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) { dva_t *d = &zio->io_bp->blk_dva[i]; metaslab_group_alloc_decrement(zio->io_spa, DVA_GET_VDEV(d), zio->io_allocator, METASLAB_ASYNC_ALLOC, zio->io_size, zio); } zio->io_error = EAGAIN; } if (zio->io_error != 0) return; ddt_enter(ddt); ddt_phys_extend(dde->dde_phys, v, zio->io_bp); zio_t *pio; zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) { if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); /* * Deduplication will not take place for Direct I/O writes. The * ddt_tree will be emptied in syncing context. Direct I/O writes take * place in the open-context. Direct I/O write can not attempt to * modify the ddt_tree while issuing out a write. */ ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE); ddt_enter(ddt); /* * Search DDT for matching entry. Skip DVAs verification here, since * they can go only from override, and once we get here the override * pointer can't have "D" flag to be confused with pruned DDT entries. 
*/ IMPLY(zio->io_bp_override, !BP_GET_DEDUP(zio->io_bp_override)); dde = ddt_lookup(ddt, bp, B_FALSE); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); if (zio->io_bp_override == NULL) zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_univ_phys_t *ddp = dde->dde_phys; /* * In the common cases, at this point we have a regular BP with no * allocated DVAs, and the corresponding DDT entry for its checksum. * Our goal is to fill the BP with enough DVAs to satisfy its copies= * requirement. * * One of three things needs to happen to fulfill this: * * - if the DDT entry has enough DVAs to satisfy the BP, we just copy * them out of the entry and return; * * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to * issue the write as normal so that DVAs can be allocated and the * data land on disk. We then copy the DVAs into the DDT entry on * return. * * - if the DDT entry has some DVAs, but too few, we have to issue the * write, adjusted to allocate fewer copies. When it returns, we * add the new DVAs to the DDT entry, and update the BP to have the * full amount it originally requested.
* * In all cases, if there's already a writing IO in flight, we need to * defer the action until after the write is done. If our action is to * write, we need to adjust our request for additional DVAs to match * what will be in the DDT entry after it completes. In this way every * IO can be guaranteed to receive enough DVAs simply by joining the * end of the chain and letting the sequence play out. */ /* * Number of DVAs in the DDT entry. If the BP is encrypted we ignore * the third one as normal. */ int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); boolean_t is_ganged = ddt_phys_is_gang(ddp, v); /* Number of DVAs requested by the IO. */ uint8_t need_dvas = zp->zp_copies; /* Number of DVAs in outstanding writes for this dde. */ uint8_t parent_dvas = 0; /* * What we do next depends on whether or not there's IO outstanding that * will update this entry. */ if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { /* * No IO outstanding, so we only need to worry about ourselves. */ /* * Override BPs bring their own DVAs and their own problems. */ if (zio->io_bp_override) { /* * For a brand-new entry, all the work has been done * for us, and we can just fill it out from the provided * block and leave. */ if (have_dvas == 0) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_extend(ddp, v, bp); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * If we already have this entry, then we want to treat * it like a regular write. To do this we just wipe * them out and proceed like a regular write. * * Even if there are some DVAs in the entry, we still * have to clear them out. We can't use them to fill * out the dedup entry, as they are all referenced * together by a bp already on disk, and will be freed * as a group.
*/ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } /* * If there are enough DVAs in the entry to service our request, * then we can just use them as-is. */ if (have_dvas >= need_dvas) { ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); return (zio); } /* * Otherwise, we have to issue IO to fill the entry up to the * amount we need. */ need_dvas -= have_dvas; } else { /* * There's a write in-flight. If there's already enough DVAs on * the entry, then either there were already enough to start * with, or the in-flight IO is between READY and DONE, and so * has extended the entry with new DVAs. Either way, we don't * need to do anything, we can just slot in behind it. */ if (zio->io_bp_override) { /* * If there's a write out, then we're soon going to * have our own copies of this block, so clear out the * override block and treat it as a regular dedup * write. See comment above. */ BP_ZERO_DVAS(bp); BP_SET_BIRTH(bp, 0, 0); } if (have_dvas >= need_dvas) { /* * A minor point: there might already be enough * committed DVAs in the entry to service our request, * but we don't know which are completed and which are * allocated but not yet written. In this case, should * the IO for the new DVAs fail, we will be on the end * of the IO chain and will also receive an error, even * though our request could have been serviced. * * This is an extremely rare case, as it requires the * original block to be copied with a request for a * larger number of DVAs, then copied again requesting * the same (or already fulfilled) number of DVAs while * the first request is active, and then that first * request errors. In return, the logic required to * catch and handle it is complex. For now, I'm just * not going to bother with it. */ /* * We always fill the bp here as we may have arrived * after the in-flight write has passed READY, and so * missed out.
*/ ddt_bp_fill(ddp, v, bp, txg); zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * There's not enough in the entry yet, so we need to look at * the write in-flight and see how many DVAs it will have once * it completes. * * The in-flight write has potentially had its copies request * reduced (if we're filling out an existing entry), so we need * to reach in and get the original write to find out what it is * expecting. * * Note that the parent of the lead zio will always have the * highest zp_copies of any zio in the chain, because ones that * can be serviced without additional IO are always added to * the back of the chain. */ zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); ASSERT(pio); parent_dvas = pio->io_prop.zp_copies; if (parent_dvas >= need_dvas) { zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); ddt_exit(ddt); return (zio); } /* * Still not enough, so we will need to issue to get the * shortfall. */ need_dvas -= parent_dvas; } /* * The entry could not satisfy this request as-is and is_ganged is * set: turn dedup off for this zio and send it down the ordinary * write pipeline instead of issuing a DDT child write. */ if (is_ganged) { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } /* * We need to write. We will create a new write with the copies * property adjusted to match the number of DVAs we need to * grow the DDT entry by to satisfy the request. */ zio_prop_t czp = *zp; if (have_dvas > 0 || parent_dvas > 0) { czp.zp_copies = need_dvas; czp.zp_gang_copies = 0; } else { ASSERT3U(czp.zp_copies, ==, need_dvas); } zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); /* * We are the new lead zio, because our parent has the highest * zp_copies that has been requested for this entry so far.
*/ ddt_alloc_entry_io(dde); if (dde->dde_io->dde_lead_zio[p] == NULL) { /* * First time out, take a copy of the stable entry to revert * to if there's an error (see zio_ddt_child_write_done()) */ ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); } else { /* * Make the existing chain our child, because it cannot * complete until we have. */ zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); } dde->dde_io->dde_lead_zio[p] = cio; ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); if (v != DDT_PHYS_NONE) ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); /* * When no entry was found, it must have been pruned, * so we can free it now instead of decrementing the * refcount in the DDT. */ if (!dde) { BP_SET_DEDUP(bp, 0); zio->io_pipeline |= ZIO_STAGE_DVA_FREE; } return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more) { zio_t *zio; ASSERT(MUTEX_HELD(&mca->mca_lock)); zio = avl_first(&mca->mca_tree); if (zio == NULL) { *more = B_FALSE; return (NULL); } ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
*/ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio, B_FALSE, more)) { return (NULL); } avl_remove(&mca->mca_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); if (avl_is_empty(&mca->mca_tree)) *more = B_FALSE; return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; boolean_t more; /* * If not already chosen, choose an appropriate allocation class. */ mc = zio->io_metaslab_class; if (mc == NULL) mc = spa_preferred_class(spa, zio); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zio->io_metaslab_class = mc; metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; mutex_enter(&mca->mca_lock); avl_add(&mca->mca_tree, zio); nio = zio_io_to_allocate(mca, &more); mutex_exit(&mca->mca_lock); return (nio); } static void zio_allocate_dispatch(metaslab_class_t *mc, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; zio_t *zio; boolean_t more; do { mutex_enter(&mca->mca_lock); zio = zio_io_to_allocate(mca, &more); mutex_exit(&mca->mca_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } while (more); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc, *newmc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } if (zio->io_flags & ZIO_FLAG_PREALLOCATED) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG); 
/* * ZIO_FLAG_PREALLOCATED: take the DVAs and birth txgs saved in * io_bp_orig instead of calling metaslab_alloc(). * NOTE(review): the literal 3 is presumably SPA_DVAS_PER_BP (the * whole blk_dva[] array) -- confirm against the blkptr_t definition. */ memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva, 3 * sizeof (dva_t)); BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig), BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig)); return (zio); } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * If not already chosen, choose an appropriate allocation class. */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio); zio->io_metaslab_class = mc; } ZIOSTAT_BUMP(ziostat_total_allocations); again: /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in some other class(es). * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio->io_allocator, zio); /* * When the dedup or special class is spilling into the normal class, * there can still be significant space available due to deferred * frees that are in-flight. We track the txg when this occurred and * back off adding new DDT entries for a few txgs to allow the free * blocks to be processed.
*/ if (error == ENOSPC && spa->spa_dedup_class_full_txg != zio->io_txg && (mc == spa_dedup_class(spa) || (mc == spa_special_class(spa) && !spa_has_dedup(spa) && spa_special_has_ddt(spa)))) { spa->spa_dedup_class_full_txg = zio->io_txg; zfs_dbgmsg("%s[%llu]: %s class spilling, req size %llu, " "%llu allocated of %llu", spa_name(spa), (u_longlong_t)zio->io_txg, - mc == spa_dedup_class(spa) ? "dedup" : "special", + metaslab_class_get_name(mc), (u_longlong_t)zio->io_size, (u_longlong_t)metaslab_class_get_alloc(mc), (u_longlong_t)metaslab_class_get_space(mc)); } /* * Fall back to some other class when this one is full. */ if (error == ENOSPC && (newmc = spa_preferred_class(spa, zio)) != mc) { /* * If we are holding old class reservation, drop it. * Dispatch the next ZIO(s) there if some are waiting. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { if (metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; } if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { - zfs_dbgmsg("%s: metaslab allocation failure, " - "trying fallback: zio %px, size %llu, error %d", - spa_name(spa), zio, (u_longlong_t)zio->io_size, - error); + zfs_dbgmsg("%s: metaslab allocation failure in %s " + "class, trying fallback to %s class: zio %px, " + "size %llu, error %d", spa_name(spa), + metaslab_class_get_name(mc), + metaslab_class_get_name(newmc), + zio, (u_longlong_t)zio->io_size, error); } zio->io_metaslab_class = mc = newmc; ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); /* * If the new class uses throttling, return to that pipeline * stage. Otherwise just do another allocation attempt. 
*/ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && mc->mc_alloc_throttle_enabled && zio->io_child_type != ZIO_CHILD_GANG && !(zio->io_flags & ZIO_FLAG_NODATA)) { zio->io_stage = ZIO_STAGE_DVA_THROTTLE >> 1; return (zio); } goto again; } if (error == ENOSPC && zio->io_size > spa->spa_min_alloc) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } ZIOSTAT_BUMP(ziostat_gang_writes); if (flags & METASLAB_GANG_CHILD) ZIOSTAT_BUMP(ziostat_gang_multilevel); return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) { metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), B_TRUE); } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. 
*/ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. 
*/ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. 
*/ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). 
In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. 
*/ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { /* * "no-op" injections return success, but do no actual * work. Just return it. */ zio_delay_interrupt(zio); return (NULL); } } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FLUSH || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { if (zio->io_type != ZIO_TYPE_FLUSH) vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. 
This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } /* * If a Direct I/O operation has a checksum verify error then this I/O * should not attempt to be issued again. 
*/ if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); ASSERT3U(zio->io_error, ==, EIO); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future, and * then we act like the flush succeeded. 
*/ if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; zio->io_error = 0; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. 
*/ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. 
*/ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. 
*/ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. 
*/ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ, !(zio->io_flags & ZIO_FLAG_SPECULATIVE)); if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { if (zio->io_flags & ZIO_FLAG_DIO_READ) { zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_t *pio = zio_unique_parent(zio); /* * Any Direct I/O read that has a checksum * error must be treated as suspicous as the * contents of the buffer could be getting * manipulated while the I/O is taking place. * * The checksum verify error will only be * reported here for disk and file VDEV's and * will be reported on those that the failure * occurred on. Other types of VDEV's report the * verify failure in their own code paths. */ if (pio->io_child_type == ZIO_CHILD_LOGICAL) { zio_dio_chksum_verify_error_report(zio); } } else { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } } return (zio); } static zio_t * zio_dio_checksum_verify(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); int error; ASSERT3P(zio->io_vd, !=, NULL); ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0) goto out; if ((error = zio_checksum_error(zio, NULL)) != 0) { zio->io_error = error; if (error == ECKSUM) { zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); } } out: return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum 
twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * Report Direct I/O checksum verify error and create ZED event. */ void zio_dio_chksum_verify_error_report(zio_t *zio) { ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); if (zio->io_child_type == ZIO_CHILD_LOGICAL) return; mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_dio_verify_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_WRITE) { /* * Convert checksum error for writes into EIO. */ zio->io_error = SET_ERROR(EIO); /* * Report dio_verify_wr ZED event. */ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } else { /* * Report dio_verify_rd ZED event. */ (void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? 
e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ if (metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. 
*/ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); /* * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. 
*/
	if (pio->io_child_type == ZIO_CHILD_GANG &&
	    (pio->io_flags & ZIO_FLAG_IO_REWRITE))
		tag = zio_unique_parent(pio);

	ASSERT(IO_IS_ALLOCATING(pio) ||
	    (pio->io_child_type == ZIO_CHILD_GANG &&
	    (pio->io_flags & ZIO_FLAG_IO_REWRITE)));
	ASSERT(ZIO_HAS_ALLOCATOR(pio));
	ASSERT3P(zio, !=, zio->io_logical);
	ASSERT(zio->io_logical != NULL);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
	ASSERT(zio->io_metaslab_class != NULL);
	ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);

	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
	    pio->io_allocator, flags, pio->io_size, tag);

	if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) {
		zio_allocate_dispatch(zio->io_metaslab_class,
		    pio->io_allocator);
	}
}

/*
 * Last stage of the I/O pipeline (it is the final entry in zio_pipeline[]).
 *
 * In order, this function:
 *   - waits until every child of the zio has reached ZIO_WAIT_DONE;
 *   - updates allocation throttle accounting for allocating vdev children;
 *   - inherits vdev/gang/ddt child errors, finishes checksum reports, pops
 *     transforms and updates vdev statistics;
 *   - posts delay, I/O and data ereports where applicable;
 *   - for a failed logical zio, decides whether it must be reexecuted now or
 *     suspended, and if so hands the zio to its parent, to zio_suspend() or
 *     to a taskq instead of completing it;
 *   - otherwise invokes the io_done callback, notifies all parents, and
 *     either wakes the waiter or destroys the zio.
 *
 * Returns NULL when the zio has to wait for children or is being
 * reexecuted/suspended.  Otherwise returns the zio (if any) that
 * zio_notify_parent() selected to execute next.
 */
static zio_t *
zio_done(zio_t *zio)
{
	/*
	 * Always attempt to keep stack usage minimal here since
	 * we can be called recursively up to 19 levels deep.
	 */
	const uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;
	zio_link_t *zl = NULL;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
		return (NULL);
	}

	/*
	 * If the allocation throttle is enabled, then update the accounting.
	 * We only track child I/Os that are part of an allocating async
	 * write. We must do this since the allocation is performed
	 * by the logical I/O but the actual write is done by child I/Os.
	 */
	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
	    zio->io_child_type == ZIO_CHILD_VDEV)
		zio_dva_throttle_done(zio);

	/* Past the wait above, no child of any type may be outstanding. */
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/* Sanity-check the block pointer of any non-embedded bp. */
	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
		ASSERT(zio->io_bp->blk_pad[0] == 0);
		ASSERT(zio->io_bp->blk_pad[1] == 0);
		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
		    sizeof (blkptr_t)) == 0 ||
		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT3U(zio->io_prop.zp_copies, <=,
			    BP_GET_NDVAS(zio->io_bp));
			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
			    (BP_COUNT_GANG(zio->io_bp) ==
			    BP_GET_NDVAS(zio->io_bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			abd_t *adata = zio->io_abd;

			/*
			 * When psize is not a multiple of the report's
			 * alignment, hand the report a zero-padded copy of
			 * the data instead of the original buffer.
			 */
			if (adata != NULL && asize != psize) {
				adata = abd_alloc(asize, B_TRUE);
				abd_copy(adata, zio->io_abd, psize);
				abd_zero_off(adata, psize, asize - psize);
			}

			/* Unlink the report before finishing and freeing it. */
			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, adata);
			zfs_ereport_free_checksum(zcr);

			/* Release the padded copy allocated above, if any. */
			if (adata != NULL && asize != psize)
				abd_free(adata);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	/*
	 * If this I/O was slow, i.e. its delay is at least zio_slow_io_ms
	 * milliseconds, and it is attached to a particular vdev, post an
	 * ereport describing the I/O delay.
	 * We ignore these errors if the device is currently unavailable.
	 */
	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
			/*
			 * We want to only increment our slow IO counters if
			 * the IO is valid (i.e. not if the drive is removed).
			 *
			 * zfs_ereport_post() will also do these checks, but
			 * it can also ratelimit and have other failures, so we
			 * need to increment the slow_io counters independent
			 * of it.
			 */
			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
			    zio->io_spa, zio->io_vd, zio)) {
				mutex_enter(&zio->io_vd->vdev_stat_lock);
				zio->io_vd->vdev_stat.vs_slow_ios++;
				mutex_exit(&zio->io_vd->vdev_stat_lock);

				(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
				    zio, 0);
			}
		}
	}

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
		    !vdev_is_dead(zio->io_vd) &&
		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
			/*
			 * Only bump the per-vdev error counters when the
			 * ereport was not rejected with EALREADY.
			 */
			if (ret != EALREADY) {
				mutex_enter(&zio->io_vd->vdev_stat_lock);
				if (zio->io_type == ZIO_TYPE_READ)
					zio->io_vd->vdev_stat.vs_read_errors++;
				else if (zio->io_type == ZIO_TYPE_WRITE)
					zio->io_vd->vdev_stat.vs_write_errors++;
				mutex_exit(&zio->io_vd->vdev_stat_lock);
			}
		}

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
		    zio == zio->io_logical) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(zio->io_spa, &zio->io_bookmark,
			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
		}
	}

	if (zio->io_error && zio == zio->io_logical) {

		/*
		 * A DDT child tried to create a mixed gang/non-gang BP. We're
		 * going to have to just retry as a non-dedup IO.
		 */
		if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
		    zio->io_prop.zp_dedup) {
			zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			zio->io_prop.zp_dedup = B_FALSE;
		}
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		/*
		 * A failed allocating I/O that may not fail is retried
		 * immediately, except on ENOSPC, where it is suspended.
		 */
		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * An I/O that is not allowed to fail and has no reexecute
		 * decision yet is suspended.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	/*
	 * A failed or to-be-reexecuted allocating gang leader gives back
	 * what it allocated, unless this is a rewrite or a nop-write.
	 */
	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;

	if (zio->io_reexecute) {
		/*
		 * A Direct I/O operation that has a checksum verify error
		 * should not attempt to reexecute. Instead, the error should
		 * just be propagated back.
		 */
		ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));

		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them. It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended). This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		zl = NULL;
		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
		    pio = pio_next) {
			/*
			 * Fetch the next parent before possibly removing the
			 * current link from the list being walked.
			 */
			zio_link_t *remove_zl = zl;
			pio_next = zio_walk_parents(zio, &zl);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, remove_zl);
				/*
				 * This is a rare code path, so we don't
				 * bother with "next_to_execute".
				 */
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
				    NULL);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			/*
			 * This is a rare code path, so we don't bother with
			 * "next_to_execute".
			 */
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			spa_taskq_dispatch(zio->io_spa,
			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
			    zio_reexecute, zio, B_FALSE);
		}
		return (NULL);
	}

	ASSERT(list_is_empty(&zio->io_child_list));
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	/*
	 * We are done executing this zio.  We may want to execute a parent
	 * next.  See the comment in zio_notify_parent().
	 */
	zio_t *next_to_execute = NULL;
	zl = NULL;
	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
		/* Advance the walk before unlinking the current parent. */
		zio_link_t *remove_zl = zl;
		pio_next = zio_walk_parents(zio, &zl);
		zio_remove_child(pio, zio, remove_zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
	}

	/*
	 * If somebody is waiting on this zio, wake them and leave the zio
	 * alive; otherwise nobody else will look at it, so destroy it here.
	 */
	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (next_to_execute);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_write_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_compress,
	zio_encrypt,
	zio_checksum_generate,
	zio_nop_write,
	zio_brt_free,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_throttle,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_dio_checksum_verify,
	zio_done
};

/*
 * Compare two zbookmark_phys_t's to see which we would reach first in a
 * pre-order traversal of the object tree.
 *
 * This is simple in every case aside from the meta-dnode object. For all other
 * objects, we traverse them in order (object 1 before object 2, and so on).
 * However, all of these objects are traversed while traversing object 0, since
 * the data it points to is the list of objects.  Thus, we need to convert to a
 * canonical representation so we can compare meta-dnode bookmarks to
 * non-meta-dnode bookmarks.
* * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. 
*/ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. 
*/ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. 
*/ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs");