Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c (revision 344936) @@ -1,4216 +1,4216 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, force_ganging, CTLFLAG_RWTUN, &metaslab_force_ganging, 0, "Force gang block allocation for blocks larger than or equal to this value"); /* * Since we can touch multiple metaslabs (and their respective space maps) * with each transaction group, we benefit from having a smaller space map * block size since it allows us to issue more I/O operations scattered * around the disk. */ int zfs_metaslab_sm_blksz = (1 << 12); SYSCTL_INT(_vfs_zfs, OID_AUTO, metaslab_sm_blksz, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz, 0, "Block size for metaslab DTL space map. Power of 2 and greater than 4096."); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ int zfs_condense_pct = 200; SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ int zfs_mg_noalloc_threshold = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, &zfs_mg_noalloc_threshold, 0, "Percentage of metaslab group size that should be free" " to make it eligible for allocation"); /* * Metaslab groups are considered eligible for allocations if their * fragmenation metric (measured as a percentage) is less than or equal to * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold * then it will be skipped unless all metaslab groups within the metaslab * class have also crossed this threshold. */ int zfs_mg_fragmentation_threshold = 85; SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, &zfs_mg_fragmentation_threshold, 0, "Percentage of metaslab group size that should be considered " "eligible for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ int zfs_metaslab_fragmentation_threshold = 70; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, &zfs_metaslab_fragmentation_threshold, 0, "Maximum percentage of metaslab fragmentation level to keep their active state"); /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = 0; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, &metaslab_debug_load, 0, "Load all metaslabs when pool is first opened"); /* * When set will prevent metaslabs from being unloaded. */ int metaslab_debug_unload = 0; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, &metaslab_debug_unload, 0, "Prevent metaslabs from being unloaded"); /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change it's allocation strategy"); /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a " "space map to continue allocations in a first-fit fashion"); /* * A metaslab is considered "free" if it contains a contiguous * segment which is greater than metaslab_min_alloc_size. */ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, &metaslab_min_alloc_size, 0, "A metaslab is considered \"free\" if it contains a contiguous " "segment which is greater than vfs.zfs.metaslab.min_alloc_size"); /* * Percentage of all cpus that can be used by the metaslab taskq. */ int metaslab_load_pct = 50; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, &metaslab_load_pct, 0, "Percentage of cpus that can be used by the metaslab taskq"); /* * Determines how many txgs a metaslab may remain loaded without having any * allocations from it. As long as a metaslab continues to be used we will * keep it loaded. */ int metaslab_unload_delay = TXG_SIZE * 2; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, &metaslab_unload_delay, 0, "Number of TXGs that an unused metaslab can be kept in memory"); /* * Max number of metaslabs per group to preload. */ int metaslab_preload_limit = SPA_DVAS_PER_BP; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, &metaslab_preload_limit, 0, "Max number of metaslabs per group to preload"); /* * Enable/disable preloading of metaslab. */ boolean_t metaslab_preload_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, &metaslab_preload_enabled, 0, "Max number of metaslabs per group to preload"); /* * Enable/disable fragmentation weighting on metaslabs. */ boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, &metaslab_fragmentation_factor_enabled, 0, "Enable fragmentation weighting on metaslabs"); /* * Enable/disable lba weighting (i.e. outer tracks are given preference). */ boolean_t metaslab_lba_weighting_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, &metaslab_lba_weighting_enabled, 0, "Enable LBA weighting (i.e. outer tracks are given preference)"); /* * Enable/disable metaslab group biasing. */ boolean_t metaslab_bias_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, &metaslab_bias_enabled, 0, "Enable metaslab group biasing"); /* * Enable/disable remapping of indirect DVAs to their concrete vdevs. */ boolean_t zfs_remap_blkptr_enable = B_TRUE; /* * Enable/disable segment-based metaslab selection. */ boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted * zfs_metaslab_switch_threshold of its buckets. */ int zfs_metaslab_switch_threshold = 2; /* * Internal switch to enable/disable the metaslab allocation tracing * facility. */ boolean_t metaslab_trace_enabled = B_TRUE; /* * Maximum entries that the metaslab allocation tracing facility will keep * in a given list when running in non-debug mode. We limit the number * of entries in non-debug mode to prevent us from using up too much memory. * The limit should be sufficiently large that we don't expect any allocation * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ uint64_t metaslab_trace_max_entries = 5000; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (refcount_t), KM_SLEEP); mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { ASSERT(mc->mc_rotor == NULL); ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) refcount_destroy(&mc->mc_alloc_slots[i]); kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * sizeof (refcount_t)); kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_rotor); return (0); } void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } void metaslab_class_minblocksize_update(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; uint64_t minashift = UINT64_MAX; if ((mg = mc->mc_rotor) == NULL) { mc->mc_minblocksize = SPA_MINBLOCKSIZE; return; } do { vd = mg->mg_vd; if (vd->vdev_ashift < minashift) minashift = vd->vdev_ashift; } while ((mg = mg->mg_next) != mc->mc_rotor); mc->mc_minblocksize = 1ULL << minashift; } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc) { return (mc->mc_minblocksize); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { uint64_t tspace; vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. * Adjust by efi system partition size. */ tspace = tvd->vdev_max_asize - tvd->vdev_asize; if (tspace > mc->mc_spa->spa_bootsize) { tspace -= mc->mc_spa->spa_bootsize; } space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (AVL_CMP(m1->ms_start, m2->ms_start)); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocated = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an allocated * space map. Calling this in non-syncing context does not * provide a consistent view of the metaslab since we're performing * allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - space_map_alloc_delta(msp->ms_sm); /* * Account for future allocations since we would have already * deducted that space from the ms_freetree. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocated += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), KM_SLEEP); mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * sizeof (uint64_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); kmem_free(mg->mg_secondaries, mg->mg_allocators * sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_initialize_lock); cv_destroy(&mg->mg_ms_initialize_cv); for (int i = 0; i < mg->mg_allocators; i++) { refcount_destroy(&mg->mg_alloc_queue_depth[i]); mg->mg_cur_max_alloc_queue_depth[i] = 0; } kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * sizeof (refcount_t)); kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * sizeof (uint64_t)); kmem_free(mg, sizeof (metaslab_group_t)); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } mc->mc_rotor = mg; metaslab_class_minblocksize_update(mc); } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait(mg->mg_taskq); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_t *msp = mg->mg_primaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mg->mg_secondaries[i]; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mc->mc_rotor = NULL; } else { mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } mg->mg_prev = NULL; mg->mg_next = NULL; metaslab_class_minblocksize_update(mc); } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; vdev_t *vd = mg->mg_vd; uint64_t ashift = vd->vdev_ashift; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_sm == NULL) continue; for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab * groups to select from. Otherwise, we always consider it eligible * for allocations. */ if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mg_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. */ qmax = qmax * (4 + d) / 4; qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's * the only allocatable metasable group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; qmax = qmax * (4 + d) / 4; qdepth = refcount_count( &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree. Tree is sorted * by size, larger sizes at the end of the tree. */ static int metaslab_rangesize_compare(const void *x1, const void *x2) { const range_seg_t *r1 = x1; const range_seg_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = AVL_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); if (r1->rs_start < r2->rs_start) return (-1); return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_block_maxsize(metaslab_t *msp) { avl_tree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL || (rs = avl_last(t)) == NULL) return (0ULL); return (rs->rs_end - rs->rs_start); } static range_seg_t * metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) { range_seg_t *rs, rsearch; avl_index_t where; rsearch.rs_start = start; rsearch.rs_end = start + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL) { rs = avl_nearest(t, where, AVL_AFTER); } return (rs); } /* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); while (rs != NULL) { uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { *cursor = offset + size; return (offset); } rs = AVL_NEXT(t, rs); } /* * If we know we've searched the whole map (*cursor == 0), give up. * Otherwise, reset the cursor to the beginning and try again. */ if (*cursor == 0) return (-1ULL); *cursor = 0; return (metaslab_block_picker(t, cursor, size, align)); } /* * ========================================================================== * The first-fit block allocator * ========================================================================== */ static uint64_t metaslab_ff_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_allocatable->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { metaslab_ff_alloc }; /* * ========================================================================== * Dynamic block allocator - * Uses the first fit allocation scheme until space get low and then * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold * and metaslab_df_free_pct to determine when to switch the allocation scheme. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &rt->rt_root; uint64_t max_size = metaslab_block_maxsize(msp); int free_pct = range_tree_space(rt) * 100 / msp->ms_size; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); /* * If we're running low on space switch to using the size * sorted AVL tree (best-fit). */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { t = &msp->ms_allocatable_by_size; *cursor = 0; } return (metaslab_block_picker(t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { metaslab_df_alloc }; /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; avl_tree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; rs = avl_last(&msp->ms_allocatable_by_size); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) return (-1ULL); *cursor = rs->rs_start; *cursor_end = rs->rs_end; } offset = *cursor; *cursor += size; return (offset); } static metaslab_ops_t metaslab_cf_ops = { metaslab_cf_alloc }; /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { avl_tree_t *t = &msp->ms_allocatable->rt_root; avl_index_t where; range_seg_t *rs, rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_block_maxsize(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); rsearch.rs_start = *cursor; rsearch.rs_end = *cursor + size; rs = avl_find(t, &rsearch, &where); if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { t = &msp->ms_allocatable_by_size; rsearch.rs_start = 0; rsearch.rs_end = MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift)); rs = avl_find(t, &rsearch, &where); if (rs == NULL) rs = avl_nearest(t, where, AVL_AFTER); ASSERT(rs != NULL); } if ((rs->rs_end - rs->rs_start) >= size) { *cursor = rs->rs_start + size; return (rs->rs_start); } return (-1ULL); } static metaslab_ops_t metaslab_ndf_ops = { metaslab_ndf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } int metaslab_load(metaslab_t *msp) { int error = 0; boolean_t success = B_FALSE; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_loaded); ASSERT(!msp->ms_loading); msp->ms_loading = B_TRUE; /* * Nobody else can manipulate a loading metaslab, so it's now safe * to drop the lock. This way we don't have to hold the lock while * reading the spacemap from disk. */ mutex_exit(&msp->ms_lock); /* * If the space map has not been allocated yet, then treat * all the space in the metaslab as free and add it to ms_allocatable. */ if (msp->ms_sm != NULL) { error = space_map_load(msp->ms_sm, msp->ms_allocatable, SM_FREE); } else { range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); } success = (error == 0); mutex_enter(&msp->ms_lock); msp->ms_loading = B_FALSE; if (success) { ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * If the metaslab already has a spacemap, then we need to * remove all segments from the defer tree; otherwise, the * metaslab is completely empty and we can skip this. */ if (msp->ms_sm != NULL) { for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, msp->ms_allocatable); } } msp->ms_max_size = metaslab_block_maxsize(msp); } cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; objset_t *mos = vd->vdev_spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); } /* * We create the main range tree here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. */ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) metaslab_sync_done(ms, 0); /* * If metaslab_debug_load is set and we're initializing a metaslab * that has an allocated space map object then load the its space * map so that can verify frees. */ if (metaslab_debug_load && ms->ms_sm != NULL) { mutex_enter(&ms->ms_lock); VERIFY0(metaslab_load(ms)); mutex_exit(&ms->ms_lock); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); space_map_close(msp->ms_sm); metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); range_tree_destroy(msp->ms_checkpointing); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } #define FRAGMENTATION_TABLE_SIZE 17 /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmetation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. A 10% change in fragmentation equates to approximately * double the number of segments. * * This table defines 0% fragmented space using 16MB segments. Testing has * shown that segments that are greater than or equal to 16MB do not suffer * from drastic performance problems. Using this value, we derive the rest * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ 95, /* 4K */ 90, /* 8K */ 80, /* 16K */ 70, /* 32K */ 60, /* 64K */ 50, /* 128K */ 40, /* 256K */ 30, /* 512K */ 20, /* 1M */ 15, /* 2M */ 10, /* 4M */ 5, /* 8M */ 0 /* 16M */ }; /* * Calclate the metaslab's fragmentation metric. A return value * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does * not support this metric. Otherwise, the return value should be in the * range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. */ if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", txg, msp->ms_id, vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - space_map_allocated(msp->ms_sm); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - if (metaslab_lba_weighting_enabled) { + if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. This should only * be called after a sync pass has completely finished since the on-disk * information is updated in metaslab_sync(). */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { WEIGHT_SET_COUNT(weight, msp->ms_sm->sm_phys->smp_histogram[i]); WEIGHT_SET_INDEX(weight, i + msp->ms_sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (space_map_allocated(msp->ms_sm) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (space_map_allocated(msp->ms_sm) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab has a maximum size then we can quickly determine if the desired * allocation size can be satisfied. Otherwise, if we're using segment-based * weighting then we can determine the maximum allocation that this metaslab * can accommodate based on the index encoded in the weight. If we're using * space-based weights then rely on the entire weight (excluding the weight * type bit). */ boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize) { boolean_t should_allocate; if (msp->ms_max_size != 0) return (msp->ms_max_size >= asize); if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If this vdev is in the process of being removed, there is nothing * for us to do here. */ if (vd->vdev_removing) return (0); metaslab_set_fragmentation(msp); /* * Update the maximum size if the metaslab is loaded. This will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. */ if (msp->ms_loaded) msp->ms_max_size = metaslab_block_maxsize(msp); /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) return (0); metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? mg->mg_primaries : mg->mg_secondaries); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); if (arr[allocator] != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } arr[allocator] = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int error = 0; metaslab_load_wait(msp); if (!msp->ms_loaded) { if ((error = metaslab_load(msp)) != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } } if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { /* * The metaslab was activated for another allocator * while we were waiting, we should reselect. */ return (EBUSY); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); if (msp->ms_primary) { ASSERT3U(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mg->mg_primaries[msp->ms_allocator] = NULL; } else { ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); mg->mg_secondaries[msp->ms_allocator] = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_is_empty(msp->ms_allocatable)); ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhaused the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslabs with larger contiguous region if any remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); metaslab_load_wait(msp); if (!msp->ms_loaded) (void) metaslab_load(msp); msp->ms_selected_txg = spa_syncing_txg(spa); mutex_exit(&msp->ms_lock); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) { taskq_wait(mg->mg_taskq); return; } mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, msp, TQ_SLEEP) != 0); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance * for inefficiency. We would like to use the following criteria to make * our decision: * * 1. The size of the space map object should not dramatically increase as a * result of writing out the free space range tree. * * 2. The minimal on-disk space map representation is zfs_condense_pct/100 * times the size than the free space range tree representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). * * 3. The on-disk size of the space map should actually decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); /* * Allocations and frees in early passes are generally more space * efficient (in terms of blocks described in space map entries) * than the ones in later passes (e.g. we don't compress after * sync pass 5) and condensing a metaslab multiple times in a txg * could degrade performance. * * Thus we prefer condensing each metaslab at most once every txg at * the earliest sync pass possible. If a metaslab is eligible for * condensing again after being considered for condensing within the * same txg, it will hopefully be dirty in the next txg where it will * be condensed at an earlier pass. */ if (msp->ms_condense_checked_txg == current_txg) return (B_FALSE); msp->ms_condense_checked_txg = current_txg; /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (avl_is_empty(&msp->ms_allocatable_by_size) || msp->ms_condense_wanted) return (B_TRUE); uint64_t object_size = space_map_length(msp->ms_sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed by * the entries of the free range tree. */ static void metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, msp->ms_group->mg_vd->vdev_spa->spa_name, space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_allocatable->rt_root), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; /* * Create an range tree that is 100% allocated. We remove segments * that have been freed in this txg, any deferred frees that exist, * and any allocation in the future. Removing segments should be * a relatively inexpensive operation since we expect these trees to * have a small number of nodes. */ condense_tree = range_tree_create(NULL, NULL); range_tree_add(condense_tree, msp->ms_start, msp->ms_size); range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, condense_tree); } for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_remove, condense_tree); } /* * We're about to drop the metaslab's lock thus allowing * other consumers to change it's content. Set the * metaslab's ms_condensing flag to ensure that * allocations on this metaslab do not occur while we're * in the middle of committing it to disk. This is only critical * for ms_allocatable as all other range trees use per txg * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); /* * While we would ideally like to create a space map representation * that consists only of allocation records, doing so can be * prohibitively expensive because the in-core free tree can be * large, and therefore computationally expensive to subtract * from the condense_tree. Instead we sync out two trees, a cheap * allocation only tree followed by the in-core free tree. While not * optimal, this is typically close to optimal, and much cheaper to * compute. */ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_freeing == NULL) { ASSERT3P(alloctree, ==, NULL); return; } ASSERT3P(alloctree, !=, NULL); ASSERT3P(msp->ms_freeing, !=, NULL); ASSERT3P(msp->ms_freed, !=, NULL); ASSERT3P(msp->ms_checkpointing, !=, NULL); /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense and it's loaded, we need to let it through. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted)) return; VERIFY(txg <= spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently with * metaslab_sync() is the metaslab's ms_allocatable. No other * thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. Note * that the ms_lock is insufficient for this, because it is dropped * by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (msp->ms_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); } if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); space_map_update(vd->vdev_checkpoint_sm); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -vd->vdev_checkpoint_sm->sm_alloc); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and * freed. We can safely do this since the freed_tree is * guaranteed to be empty on the initial pass. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); if (object != space_map_object(msp->ms_sm)) { object = space_map_object(msp->ms_sm); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); /* * If this metaslab is just becoming available, initialize its * range trees and add its capacity to the vdev. */ if (msp->ms_freed == NULL) { for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); msp->ms_allocating[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_freeing, ==, NULL); msp->ms_freeing = range_tree_create(NULL, NULL); ASSERT3P(msp->ms_freed, ==, NULL); msp->ms_freed = range_tree_create(NULL, NULL); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT(msp->ms_defer[t] == NULL); msp->ms_defer[t] = range_tree_create(NULL, NULL); } ASSERT3P(msp->ms_checkpointing, ==, NULL); msp->ms_checkpointing = range_tree_create(NULL, NULL); vdev_space_update(vd, 0, 0, msp->ms_size); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = space_map_alloc_delta(msp->ms_sm); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); /* * If there's a metaslab_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. */ metaslab_load_wait(msp); /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { range_tree_swap(&msp->ms_freed, defer_tree); } else { range_tree_vacate(msp->ms_freed, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } space_map_update(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Calculate the new weights before unloading any metaslabs. * This will give us the most accurate weighting. */ metaslab_group_sort(mg, msp, metaslab_weight(msp) | (msp->ms_weight & METASLAB_ACTIVE_MASK)); /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then unload it. */ if (msp->ms_loaded && msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } if (!metaslab_debug_unload) metaslab_unload(msp); } ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; uint64_t start = msp->ms_id; if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (1ULL << 63); if (offset < start) return ((start - offset) << ms_shift); if (offset > start) return ((offset - start) << ms_shift); return (0); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ kstat_t *metaslab_trace_ksp; kstat_named_t metaslab_trace_over_limit; void metaslab_alloc_trace_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); if (metaslab_trace_ksp != NULL) { metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; kstat_named_init(&metaslab_trace_over_limit, "metaslab_trace_over_limit", KSTAT_DATA_UINT64); kstat_install(metaslab_trace_ksp); } } void metaslab_alloc_trace_fini(void) { if (metaslab_trace_ksp != NULL) { kstat_delete(metaslab_trace_ksp); metaslab_trace_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef DEBUG panic("too many entries in allocation list"); #endif atomic_inc_64(&metaslab_trace_over_limit.value.ui64); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } metaslab_alloc_trace_t *mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; while (cur < max) { if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], cur, cur + 1) == cur) { atomic_inc_64( &mg->mg_class->mc_alloc_max_slots[allocator]); return; } cur = mg->mg_cur_max_alloc_queue_depth[allocator]; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } void metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], tag)); } #endif } static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ msp->ms_max_size = metaslab_block_maxsize(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!metaslab_should_allocate(msp, asize)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or being * initialized, skip it. */ if (msp->ms_condensing || msp->ms_initializing > 0) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; uint64_t target_distance = min_distance + (space_map_allocated(msp->ms_sm) != 0 ? 0 : min_distance >> 1); for (i = 0; i < d; i++) { if (metaslab_distance(msp, &dva[i]) < target_distance) break; } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } /* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int allocator) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight; activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mg->mg_primaries[allocator] != NULL) { msp = mg->mg_primaries[allocator]; was_active = B_TRUE; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mg->mg_secondaries[allocator] != NULL) { msp = mg->mg_secondaries[allocator]; was_active = B_TRUE; } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, min_distance, asize, allocator, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to reselect * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab is freshly activated for an allocator that * isn't the one we're allocating from, or if it's a primary and * we're seeking a secondary (or vice versa), we go back and * select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { mutex_exit(&msp->ms_lock); continue; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } if (metaslab_activate(msp, allocator, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } msp->ms_selected_txg = txg; /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() can * accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_initializing > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_INITIALIZING, allocator); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { uint64_t weight = metaslab_block_maxsize(msp); WEIGHT_SET_SPACEBASED(weight); metaslab_passivate(msp, weight); } else { metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); } /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); return (offset); } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, min_distance, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate * the minimum gang block size so it must be out of * space. We must notify the allocation throttle * to start skipping allocation attempts to this * metaslab group until more space becomes available. * Note: this failure cannot be caused by the * allocation throttle since the allocation throttle * is only responsible for skipping devices and * not failing block allocations. */ mg->mg_no_free_space = B_TRUE; } } mg->mg_allocations++; mutex_exit(&mg->mg_lock); return (offset); } /* * If we have to write a ditto block (i.e. more than one DVA for a given BP) * on the same vdev as an existing DVA of this BP, then try to allocate it * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the * existing DVAs. */ int ditto_same_vdev_distance_shift = 3; /* * Allocate a block for the specified i/o. */ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. That * way, we can hope for locality in vdev_cache, plus it makes our * fault domains something tractable. */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); /* * It's possible the vdev we're using as the hint no * longer exists or its mg has been closed (e.g. by * device removal). Consult the rotor when * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { mg = vd->vdev_mg; if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) mg = mg->mg_next; } else { mg = mc->mc_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else { mg = mc->mc_rotor; } /* * If the hint put us into the wrong metaslab class, or into a * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: do { boolean_t allocatable; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow * this metaslab group to skip allocations since that would * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, psize, allocator, d); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); goto next; } ASSERT(mg->mg_initialized); /* * Avoid writing single-copy data to a failing, * non-redundant vdev, unless we've already tried all * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); goto next; } ASSERT(mg->mg_class == mc); /* * If we don't need to try hard, then require that the * block be 1/8th of the device away from any other DVAs * in this BP. If we are trying hard, allow any offset * to be used (distance=0). */ uint64_t distance = 0; if (!try_hard) { distance = vd->vdev_asize >> ditto_same_vdev_distance_shift; if (distance <= (1ULL << vd->vdev_ms_shift)) distance = 0; } uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, distance, dva, d, allocator); if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* * Calculate how much more or less we should * try to allocate from this device during * this iteration around the rotor. * For example, if a device is 80% full * and the pool is 20% full then we should * reduce allocations by 60% on this device. * * mg_bias = (20 - 80) * 512K / 100 = -307K * * This reduces allocations by 307K for this * iteration. */ mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100; } else if (!metaslab_bias_enabled) { mg->mg_bias = 0; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: mc->mc_rotor = mg->mg_next; mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, do so now. */ if (!try_hard) { try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); range_tree_add(msp->ms_checkpointing, offset, asize); } else { range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } /* ARGSUSED */ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset); ASSERT(0); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots[allocator]); if (reserved_slots < max) available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags)) { /* * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ for (int d = 0; d < slots; d++) { reserved_slots = refcount_add(&mc->mc_alloc_slots[allocator], zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; } mutex_exit(&mc->mc_lock); return (slot_reserved); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { (void) refcount_remove(&mc->mc_alloc_slots[allocator], zio); } mutex_exit(&mc->mc_lock); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (ENXIO); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); /* * No need to fail in that case; someone else has activated the * metaslab, but that doesn't preclude us from using it. */ if (error == EBUSY) error = 0; if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; /* ARGSUSED */ static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(1M) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; int error = 0; ASSERT(bp->blk_birth == 0); ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, txg); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (bp->blk_birth <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) break; spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } /* ARGSUSED */ static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) range_tree_verify(msp->ms_allocatable, offset, size); range_tree_verify(msp->ms_freeing, offset, size); range_tree_verify(msp->ms_checkpointing, offset, size); range_tree_verify(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify(msp->ms_defer[j], offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h (revision 344936) @@ -1,545 +1,543 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern int zfs_vdev_queue_depth_pct; extern int zfs_vdev_def_queue_depth; extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *logical_ashift, uint64_t *physical_ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, range_seg_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; /* * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. * Used when initializing vdevs. Isn't used by leaf ops. */ vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ struct vdev_cache_entry { struct abd *ve_abd; uint64_t ve_offset; uint64_t ve_lastused; avl_node_t ve_offset_node; avl_node_t ve_lastused_node; uint32_t ve_hits; uint16_t ve_missed_update; zio_t *ve_fill_io; }; struct vdev_cache { avl_tree_t vc_offset_tree; avl_tree_t vc_lastused_tree; kmutex_t vc_lock; }; typedef struct vdev_queue_class { uint32_t vqc_active; /* * Sorted by offset or timestamp, depending on if the queue is * LBA-ordered vs FIFO. */ avl_tree_t vqc_queued_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; uint64_t vq_lastoffset; }; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ /* * Logical block alignment shift * * The smallest sized/aligned I/O supported by the device. */ uint64_t vdev_logical_ashift; /* * Physical block alignment shift * * The device supports logical I/Os with vdev_logical_ashift * size/alignment, but optimum performance will be achieved by * aligning/sizing requests to vdev_physical_ashift. Smaller * requests may be inflated or incur device level read-modify-write * operations. * * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). */ uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vnode_t *vdev_name_vp; /* vnode for pathname */ vnode_t *vdev_devid_vp; /* vnode for devid */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ + boolean_t vdev_nonrot; /* true if solid state */ int vdev_open_error; /* error on last open */ kthread_t *vdev_open_thread; /* thread opening children */ uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. */ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_removing; /* device is being removed? */ boolean_t vdev_ishole; /* is a hole in the namespace */ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. */ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; range_tree_t *vdev_initialize_tree; /* valid while initializing */ uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; time_t vdev_initialize_action_time; /* start and end time */ /* for limiting outstanding I/Os */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). * Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* * The queue depth parameters determine how many async writes are * still pending (i.e. allocated by net yet issued to disk) per * top-level (vdev_async_write_queue_depth) and the maximum allowed * (vdev_max_async_write_queue_depth). These values only apply to * top-level vdevs. */ uint64_t vdev_async_write_queue_depth; uint64_t vdev_max_async_write_queue_depth; /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). */ kmutex_t vdev_scan_io_queue_lock; struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. */ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_notrim; /* true if trim failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? */ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ struct trim_map *vdev_trimmap; /* map on outstanding trims */ - uint16_t vdev_rotation_rate; /* rotational rate of the media */ -#define VDEV_RATE_UNKNOWN 0 -#define VDEV_RATE_NON_ROTATING 1 uint64_t vdev_leaf_zap; /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. */ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 #define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev */ extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); /* * Add or remove children and parents */ extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_compact_children(vdev_t *pvd); extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* * Available vdev types. */ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; #ifdef _KERNEL extern vdev_ops_t vdev_geom_ops; #else extern vdev_ops_t vdev_disk_ops; #endif extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); /* * Global variables */ extern int vdev_standard_sm_blksz; /* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; extern uint_t zfs_geom_probe_vdev_key; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd); extern boolean_t vdev_obsolete_counts_are_precise(vdev_t *vd); #ifdef illumos /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd); /* * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. */ typedef struct vdev_buf { buf_t vb_buf; /* buffer that describes the io */ zio_t *vb_io; /* pointer back to the original zio_t */ } vdev_buf_t; #endif #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 344936) @@ -1,4371 +1,4378 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); /* * Virtual device management. */ /* * The limit for ZFS to automatically increase a top-level vdev's ashift * from logical ashift to physical ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 11 (2048 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 2048 as limited by * zfs_max_auto_ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * max vdev_physical_ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 9 (512 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 12 (4096 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * zfs_min_auto_ashift. */ static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; static int sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_max_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) return (EINVAL); zfs_max_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_max_auto_ashift, "QU", "Max ashift used when optimising for logical -> physical sectors size on " "new top-level vdevs."); static int sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_min_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) return (EINVAL); zfs_min_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_min_auto_ashift, "QU", "Min ashift used when creating new top-level vdevs."); static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, #ifdef _KERNEL &vdev_geom_ops, #else &vdev_disk_ops, #endif &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* target number of metaslabs per top-level vdev */ int vdev_max_ms_count = 200; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN, &vdev_max_ms_count, 0, "Target number of metaslabs per top-level vdev"); /* minimum number of metaslabs per top-level vdev */ int vdev_min_ms_count = 16; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN, &vdev_min_ms_count, 0, "Minimum number of metaslabs per top-level vdev"); /* practical upper limit of total metaslabs per top-level vdev */ int vdev_ms_count_limit = 1ULL << 17; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN, &vdev_ms_count_limit, 0, "Maximum number of metaslabs per top-level vdev"); /* lower limit for metaslab size (512M) */ int vdev_default_ms_shift = 29; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN, &vdev_default_ms_shift, 0, "Default shift between vdev size and number of metaslabs"); /* upper limit for metaslab size (256G) */ int vdev_max_ms_shift = 38; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN, &vdev_max_ms_shift, 0, "Maximum shift between vdev size and number of metaslabs"); boolean_t vdev_validate_skip = B_FALSE; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN, &vdev_validate_skip, 0, "Bypass vdev validation"); /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ int vdev_dtl_sm_blksz = (1 << 12); SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 and greater than 4096."); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ int vdev_standard_sm_blksz = (1 << 17); SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 and greater than 4096."); /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); if (vd->vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { return (vdev_count_leaves_impl(spa->spa_root_vdev)); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; spa_t *spa = cvd->vdev_spa; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_zalloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; vdev_indirect_config_t *vic; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (parent && !parent->vdev_parent && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? spa_log_class(spa) : spa_normal_class(spa), vd, spa->spa_alloc_count); } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { uint64_t count = vd->vdev_ms_count; metaslab_group_passivate(vd->vdev_mg); for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } boolean_t vdev_uses_zvols(vdev_t *vd) { if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; + vd->vdev_nonrot = B_TRUE; + /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (B_TRUE || vdev_uses_zvols(vd)) { - for (int c = 0; c < children; c++) + for (int c = 0; c < children; c++) { vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; + } return; } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != 0); taskq_destroy(tq); + + for (int c = 0; c < children; c++) + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_notrim = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) trim_map_create(vd); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (EINVAL); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; } else { /* * Make sure the alignment requirement hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (EINVAL); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && !vd->vdev_islog && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; if (vdev_validate_skip) return (0); for (uint64_t c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); if (vd->vdev_ops->vdev_op_leaf) trim_map_destroy(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * In general, we aim for vdev_max_ms_count (200) metaslabs. The * range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^38 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 256GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * -------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 50TB ~200 * 50TB - 32PB one per 256GB * > 32PB ~131,072 * ------------------------------- */ if (ms_count < vdev_min_ms_count) ms_shift = highbit64(asize / vdev_min_ms_count); else if (ms_count > vdev_max_ms_count) ms_shift = highbit64(asize / vdev_max_ms_count); else ms_shift = vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > vdev_max_ms_shift) { ms_shift = vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > vdev_ms_count_limit) ms_shift = highbit64(asize / vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ void vdev_ashift_optimize(vdev_t *vd) { if (vd == vd->vdev_top) { if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_max_auto_ashift, vd->vdev_ashift), MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * Unusual case where logical ashift > physical ashift * so we can't cap the calculated ashift based on max * ashift as that would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_min_auto_ashift, vd->vdev_ashift); } } } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * Ignore the DTLs and try all devices. This avoids a recursive * mutex enter on the vdev_dtl_lock, and also makes us try hard * when loading the pool (relying on the checksum to ensure that * we get the right data -- note that we while loading, we are * only reading the MOS, which is always checksummed). */ if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Returns B_TRUE if vdev determines offset needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg] then it is not eligible * for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag and dirty * the top level so that we persist the change. */ if (vd->vdev_resilver_txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, NULL); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. * Returns the spacemap object, or 0 if it wasn't in the ZAP * or the ZAP doesn't exist yet. */ int vdev_checkpoint_sm_object(vdev_t *vd) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { return (0); } uint64_t sm_obj = 0; int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); ASSERT(err == 0 || err == ENOENT); return (sm_obj); } int vdev_load(vdev_t *vd) { int error = 0; /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { error = vdev_load(vd->vdev_child[c]); if (error != 0) { return (error); } } vdev_set_deflate_ratio(vd); /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); if (checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift))) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); space_map_update(vd->vdev_checkpoint_sm); /* * Since the checkpoint_sm contains free entries * exclusively we can use sm_alloc to indicate the * culmulative checkpointed space that has been freed. */ vd->vdev_stat.vs_checkpoint_space = -vd->vdev_checkpoint_sm->sm_alloc; vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } space_map_update(vd->vdev_obsolete_sm); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vd->vdev_ms_array = 0; } static void vdev_remove_empty(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; dmu_tx_t *tx; ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date. */ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_has_checkpoint(spa)); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_islog && vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; if (range_tree_space(vd->vdev_obsolete_segments) > 0) { dmu_tx_t *tx; ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vdev_indirect_sync_obsolete(vd, tx); dmu_tx_commit(tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * Remove the metadata associated with this vdev once it's empty. * Note that this is typically used for log/cache device removal; * we don't empty toplevel vdevs when removing them. But if * a toplevel happens to be emptied, this is not harmful. */ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { vdev_remove_empty(vd, txg); } (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); if (vd == rvd) { for (int c = 0; c < spa->spa_l2cache.sav_count; c++) vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); for (int c = 0; c < spa->spa_spares.sav_count; c++) vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); } /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in reponse to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { /* * Assuming 47 bits of the space map entry dedicated for the entry's * offset (see description in space_map.h), we calculate the maximum * address that can be described by a space map entry for the given * device. */ uint64_t shift = vd->vdev_ashift + 47; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd = vd->vdev_top; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report intializing progress. Since we don't have the * initializing locks held, this is only an estimate (although a * fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; } /* * Report expandable space on top-level, non-auxillary devices only. * The expandable space is reported in terms of metaslab sized units * since that determines how much space the pool can expand. */ if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } } mutex_exit(&vd->vdev_stat_lock); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } vs->vs_ops[type]++; vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } if (mc != NULL) { ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, dspace_delta); } } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify the fmd of the state change. Be verbose and post * notifications even for stuff that's not important; the fmd agent can * sort it out. Don't emit state change events for non-leaf vdevs since * they can't change state on their own. The FMD can check their state * if it wants to when it sees that a leaf vdev had a state change. */ if (vd->vdev_ops->vdev_op_leaf) zfs_post_state_change(spa, vd); if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. * In addition, only a single top-level vdev is allowed. * * FreeBSD does not have above limitations. */ boolean_t vdev_is_bootable(vdev_t *vd) { #ifdef illumos if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } #endif /* illumos */ return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we panic the system. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { vdev_dbgmsg(vd, "SLOW IO: zio timestamp " "%lluns, delta %lluns, last io %lluns", fio->io_timestamp, (u_longlong_t)delta, vq->vq_io_complete_ts); fm_panic("I/O to pool '%s' appears to be " "hung on vdev guid %llu at '%s'.", spa_name(spa), (long long unsigned int) vd->vdev_guid, vd->vdev_path); } } mutex_exit(&vq->vq_lock); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 344936) @@ -1,300 +1,303 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * Virtual device vector for files. */ static taskq_t *vdev_file_taskq; void vdev_file_init(void) { vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), minclsyspri, max_ncpus, INT_MAX, 0); } void vdev_file_fini(void) { taskq_destroy(vdev_file_taskq); } static void vdev_file_hold(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static void vdev_file_rele(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; vnode_t *vp; vattr_t vattr; int error; + /* Rotational optimizations only make sense on block devices */ + vd->vdev_nonrot = B_TRUE; + /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; vp = vf->vf_vnode; goto skip_open; } vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vf->vf_vnode = vp; #ifdef _KERNEL /* * Make sure it's a regular file. */ if (vp->v_type != VREG) { #ifdef __FreeBSD__ (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); #endif vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; #ifdef __FreeBSD__ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; #endif return (SET_ERROR(ENODEV)); } #endif /* _KERNEL */ skip_open: /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kcred); VOP_UNLOCK(vp, 0); if (error) { (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vd->vdev_notrim = B_TRUE; *max_psize = *psize = vattr.va_size; *logical_ashift = SPA_MINBLOCKSHIFT; *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_vnode != NULL) { (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); } vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } /* * Implements the interrupt side for file vdev types. This routine will be * called when the I/O completes allowing us to transfer the I/O to the * interrupt taskqs. For consistency, the code structure mimics disk vdev * types. */ static void vdev_file_io_intr(zio_t *zio) { zio_delay_interrupt(zio); } static void vdev_file_io_strategy(void *arg) { zio_t *zio = arg; vdev_t *vd = zio->io_vd; vdev_file_t *vf; vnode_t *vp; void *addr; ssize_t resid; vf = vd->vdev_tsd; vp = vf->vf_vnode; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_type == ZIO_TYPE_READ) { addr = abd_borrow_buf(zio->io_abd, zio->io_size); } else { addr = abd_borrow_buf_copy(zio->io_abd, zio->io_size); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? UIO_READ : UIO_WRITE, vp, addr, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); if (zio->io_type == ZIO_TYPE_READ) { abd_return_buf_copy(zio->io_abd, addr, zio->io_size); } else { abd_return_buf(zio->io_abd, addr, zio->io_size); } if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; vdev_file_io_intr(zio); } static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL); break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, TQ_SLEEP), !=, 0); } /* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, NULL, vdev_file_hold, vdev_file_rele, NULL, vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * From userland we access disks just like files. */ #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, NULL, vdev_file_hold, vdev_file_rele, NULL, vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; #endif Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 344936) @@ -1,1165 +1,1165 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; struct consumer_vdev_elem { SLIST_ENTRY(consumer_vdev_elem) elems; vdev_t *vd; }; SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); _Static_assert(sizeof(((struct g_consumer*)NULL)->private) == sizeof(struct consumer_priv_t*), "consumer_priv_t* can't be stored in g_consumer.private"); DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. */ static int vdev_geom_bio_delete_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); /* Declare local functions */ static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); /* * Thread local storage used to indicate when a thread is probing geoms * for their guids. If NULL, this thread is not tasting geoms. If non NULL, * it is looking for a replacement for the vdev_t* that is its value. */ uint_t zfs_geom_probe_vdev_key; static void vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) { int error; uint16_t rate; error = g_getattr("GEOM::rotation_rate", cp, &rate); - if (error == 0) - vd->vdev_rotation_rate = rate; + if (error == 0 && rate == 1) + vd->vdev_nonrot = B_TRUE; else - vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; + vd->vdev_nonrot = B_FALSE; } static void vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, boolean_t do_null_update) { boolean_t needs_update = B_FALSE; char *physpath; int error, physpath_len; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); if (error == 0) { char *old_physpath; /* g_topology lock ensures that vdev has not been closed */ g_topology_assert(); old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); if (old_physpath != NULL) { needs_update = (strcmp(old_physpath, vd->vdev_physpath) != 0); spa_strfree(old_physpath); } else needs_update = do_null_update; } g_free(physpath); /* * If the physical path changed, update the config. * Only request an update for previously unset physpaths if * requested by the caller. */ if (needs_update) spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { char *old_physpath; struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; int error; priv = (struct consumer_priv_t*)&cp->private; if (SLIST_EMPTY(priv)) return; SLIST_FOREACH(elem, priv, elems) { vdev_t *vd = elem->vd; if (strcmp(attr, "GEOM::rotation_rate") == 0) { vdev_geom_set_rotation_rate(vd, cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE); return; } } } static void vdev_geom_orphan(struct g_consumer *cp) { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; g_topology_assert(); priv = (struct consumer_priv_t*)&cp->private; if (SLIST_EMPTY(priv)) /* Vdev close in progress. Ignore the event. */ return; /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ SLIST_FOREACH(elem, priv, elems) { vdev_t *vd = elem->vd; vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) { struct g_geom *gp; struct g_consumer *cp; int error; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); if (sanity) { if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible sectorsize %d\n", pp->name, pp->sectorsize); return (NULL); } else if (pp->mediasize < SPA_MINDEVSIZE) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible mediasize %ju\n", pp->name, pp->mediasize); return (NULL); } } /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); return (NULL); } ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } if (vd != NULL) vd->vdev_tsd = cp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) { struct g_geom *gp; g_topology_assert(); ZFS_LOG(1, "Detaching from %s.", cp->provider && cp->provider->name ? cp->provider->name : "NULL"); gp = cp->geom; if (open_for_read) g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { ZFS_LOG(1, "Destroying consumer for %s.", cp->provider->name ? cp->provider->name : "NULL"); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. */ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void vdev_geom_close_locked(vdev_t *vd) { struct g_consumer *cp; struct consumer_priv_t *priv; struct consumer_vdev_elem *elem, *elem_temp; g_topology_assert(); cp = vd->vdev_tsd; vd->vdev_delayed_close = B_FALSE; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); priv = (struct consumer_priv_t*)&cp->private; vd->vdev_tsd = NULL; SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { if (elem->vd == vd) { SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); g_free(elem); } } vdev_geom_detach(cp, B_TRUE); } /* * Issue one or more bios to the vdev in parallel * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO * operation is described by parallel entries from each array. There may be * more bios actually issued than entries in the array */ static void vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, off_t *sizes, int *errors, int ncmds) { struct bio **bios; u_char *p; off_t off, maxio, s, end; int i, n_bios, j; size_t bios_size; maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); n_bios = 0; /* How many bios are required for all commands ? */ for (i = 0; i < ncmds; i++) n_bios += (sizes[i] + maxio - 1) / maxio; /* Allocate memory for the bios */ bios_size = n_bios * sizeof(struct bio*); bios = kmem_zalloc(bios_size, KM_SLEEP); /* Prepare and issue all of the bios */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; p = datas[i]; s = sizes[i]; end = off + s; ASSERT((off % cp->provider->sectorsize) == 0); ASSERT((s % cp->provider->sectorsize) == 0); for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { bios[j] = g_alloc_bio(); bios[j]->bio_cmd = cmds[i]; bios[j]->bio_done = NULL; bios[j]->bio_offset = off; bios[j]->bio_length = MIN(s, maxio); bios[j]->bio_data = p; g_io_request(bios[j], cp); } } ASSERT(j == n_bios); /* Wait for all of the bios to complete, and clean them up */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; s = sizes[i]; end = off + s; for (; off < end; off += maxio, s -= maxio, j++) { errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; g_destroy_bio(bios[j]); } } kmem_free(bios, bios_size); } /* * Read the vdev config from a device. Return the number of valid labels that * were found. The vdev config will be returned in config if and only if at * least one valid label was found. */ static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) { struct g_provider *pp; nvlist_t *config; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *buf; size_t buflen; uint64_t psize, state, txg; off_t offsets[VDEV_LABELS]; off_t size; off_t sizes[VDEV_LABELS]; int cmds[VDEV_LABELS]; int errors[VDEV_LABELS]; int l, nlabels; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); size = sizeof(*vdev_lists[0]) + pp->sectorsize - ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; buflen = sizeof(vdev_lists[0]->vp_nvlist); /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; vdev_lists[l] = kmem_alloc(size, KM_SLEEP); offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; sizes[l] = size; errors[l] = 0; ASSERT(offsets[l] % pp->sectorsize == 0); } /* Issue the IO requests */ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, VDEV_LABELS); /* Parse the labels */ config = *configp = NULL; nlabels = 0; for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) continue; buf = vdev_lists[l]->vp_nvlist; if (nvlist_unpack(buf, buflen, &config, 0) != 0) continue; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(config); continue; } if (*configp != NULL) nvlist_free(*configp); *configp = config; nlabels++; } /* Free the label storage */ for (l = 0; l < VDEV_LABELS; l++) kmem_free(vdev_lists[l], size); return (nlabels); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof(void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t* known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t vdev_guid, known_guid; uint64_t id, txg, known_txg; char *pname; int i; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { VERIFY(nvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int error, nlabels; DROP_GIANT(); g_topology_lock(); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; zcp = vdev_geom_attach(pp, NULL, B_TRUE); if (zcp == NULL) continue; g_topology_unlock(); nlabels = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach(zcp, B_TRUE); if (nlabels == 0) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 0 : ENOENT); } enum match { NO_MATCH = 0, /* No matching labels found */ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid*/ ZERO_MATCH = 1, /* Should never be returned */ ONE_MATCH = 2, /* 1 label matching the vdev_guid */ TWO_MATCH = 3, /* 2 label matching the vdev_guid */ THREE_MATCH = 4, /* 3 label matching the vdev_guid */ FULL_MATCH = 5 /* all labels match the vdev_guid */ }; static enum match vdev_attach_ok(vdev_t *vd, struct g_provider *pp) { nvlist_t *config; uint64_t pool_guid, top_guid, vdev_guid; struct g_consumer *cp; int nlabels; cp = vdev_geom_attach(pp, NULL, B_TRUE); if (cp == NULL) { ZFS_LOG(1, "Unable to attach tasting instance to %s.", pp->name); return (NO_MATCH); } g_topology_unlock(); nlabels = vdev_geom_read_config(cp, &config); g_topology_lock(); vdev_geom_detach(cp, B_TRUE); if (nlabels == 0) { ZFS_LOG(1, "Unable to read config from %s.", pp->name); return (NO_MATCH); } pool_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); top_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); vdev_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); nvlist_free(config); /* * Check that the label's pool guid matches the desired guid. * Inactive spares and L2ARCs do not have any pool guid in the label. */ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); return (NO_MATCH); } /* * Check that the label's vdev guid matches the desired guid. * The second condition handles possible race on vdev detach, when * remaining vdev receives GUID of destroyed top level mirror vdev. */ if (vdev_guid == vd->vdev_guid) { ZFS_LOG(1, "guids match for provider %s.", pp->name); return (ZERO_MATCH + nlabels); } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); return (TOPGUID_MATCH); } ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); return (NO_MATCH); } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp, *best_pp; struct g_consumer *cp; const char *vdpath; enum match match, best_match; g_topology_assert(); vdpath = vd->vdev_path + sizeof("/dev/") - 1; cp = NULL; best_pp = NULL; best_match = NO_MATCH; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { match = vdev_attach_ok(vd, pp); if (match > best_match) { best_match = match; best_pp = pp; } else if (match == best_match) { if (strcmp(pp->name, vdpath) == 0) { best_pp = pp; } } if (match == FULL_MATCH) goto out; } } } out: if (best_pp) { cp = vdev_geom_attach(best_pp, vd, B_TRUE); if (cp == NULL) { printf("ZFS WARNING: Unable to attach to %s.\n", best_pp->name); } } return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guids [%ju:%ju].", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, cp->provider->name); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) cp = vdev_geom_attach(pp, vd, B_FALSE); } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; size_t bufsize; int error; /* Set the TLS to indicate downstack that we should not access zvols*/ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if ((cp = vd->vdev_tsd) != NULL) { ASSERT(vd->vdev_reopening); goto skip_open; } DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_splitting_newspa || (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user knows what he/she is doing and find * GEOM provider by its name, ignoring GUID mismatches. * * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } /* Clear the TLS now that tasting is done */ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); if (cp == NULL) { ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); error = ENOENT; } else { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; int spamode; priv = (struct consumer_priv_t*)&cp->private; if (cp->private == NULL) SLIST_INIT(priv); elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO); elem->vd = vd; SLIST_INSERT_HEAD(priv, elem, elems); spamode = spa_mode(vd->vdev_spa); if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", cp->provider->name); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", cp->provider->name, error); vdev_geom_close_locked(vd); cp = NULL; } } } /* Fetch initial physical path information for this device. */ if (cp != NULL) { vdev_geom_attrchanged(cp, "GEOM::physpath"); /* Set other GEOM characteristics */ vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE); vdev_geom_set_rotation_rate(vd, cp); } g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", error); return (error); } skip_open: pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; return (0); } static void vdev_geom_close(vdev_t *vd) { struct g_consumer *cp; cp = vd->vdev_tsd; DROP_GIANT(); g_topology_lock(); if (!vd->vdev_reopening || (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || (cp->provider != NULL && cp->provider->error != 0)))) vdev_geom_close_locked(vd); g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch(zio->io_error) { case ENOTSUP: /* * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know * that future attempts will never succeed. In this case * we set a persistent flag so that we don't bother with * requests in the future. */ switch(bp->bio_cmd) { case BIO_FLUSH: vd->vdev_nowritecache = B_TRUE; break; case BIO_DELETE: vd->vdev_notrim = B_TRUE; break; } break; case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. */ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } /* * We have to split bio freeing into two parts, because the ABD code * cannot be called in this context and vdev_op_io_done is not called * for ZIO_TYPE_IOCTL zio-s. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { g_destroy_bio(bp); zio->io_bio = NULL; } zio_delay_interrupt(zio); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; int error; vd = zio->io_vd; switch (zio->io_type) { case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } else { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } goto sendreq; default: zio->io_error = SET_ERROR(ENOTSUP); } } zio_execute(zio); return; case ZIO_TYPE_FREE: if (vd->vdev_notrim) { zio->io_error = SET_ERROR(ENOTSUP); } else if (!vdev_geom_bio_delete_disable) { goto sendreq; } zio_execute(zio); return; } sendreq: ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE || zio->io_type == ZIO_TYPE_IOCTL); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; if (zio->io_type == ZIO_TYPE_READ) { bp->bio_cmd = BIO_READ; bp->bio_data = abd_borrow_buf(zio->io_abd, zio->io_size); } else { bp->bio_cmd = BIO_WRITE; bp->bio_data = abd_borrow_buf_copy(zio->io_abd, zio->io_size); } break; case ZIO_TYPE_FREE: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; } bp->bio_done = vdev_geom_io_intr; zio->io_bio = bp; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { struct bio *bp = zio->io_bio; if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { ASSERT(bp == NULL); return; } if (bp == NULL) { ASSERT3S(zio->io_error, ==, ENXIO); return; } if (zio->io_type == ZIO_TYPE_READ) abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); else abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); g_destroy_bio(bp); zio->io_bio = NULL; } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_geom_ops = { vdev_geom_open, vdev_geom_close, vdev_default_asize, vdev_geom_io_start, vdev_geom_io_done, NULL, NULL, vdev_geom_hold, vdev_geom_rele, NULL, vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c (revision 344936) @@ -1,778 +1,778 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * Virtual device vector for mirroring. */ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { int *mm_preferred; int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; static int vdev_mirror_shift = 21; #ifdef _KERNEL SYSCTL_DECL(_vfs_zfs_vdev); static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV Mirror"); #endif /* * The load configuration settings below are tuned by default for * the case where all devices are of the same rotational type. * * If there is a mixture of rotating and non-rotating media, setting * non_rotating_seek_inc to 0 may well provide better results as it * will direct more reads to the non-rotating vdevs which are more * likely to have a higher performance. */ /* Rotating media load calculation configuration. */ static int rotating_inc = 0; #ifdef _KERNEL SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN, &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); #endif static int rotating_seek_inc = 5; #ifdef _KERNEL SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN, &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); #endif static int rotating_seek_offset = 1 * 1024 * 1024; #ifdef _KERNEL SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN, &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " "triggers a reduced rotating media seek increment"); #endif /* Non-rotating media load calculation configuration. */ static int non_rotating_inc = 0; #ifdef _KERNEL SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN, &non_rotating_inc, 0, "Non-rotating media load increment for non-seeking I/O's"); #endif static int non_rotating_seek_inc = 1; #ifdef _KERNEL SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN, &non_rotating_seek_inc, 0, "Non-rotating media load increment for seeking I/O's"); #endif static inline size_t vdev_mirror_map_size(int children) { return (offsetof(mirror_map_t, mm_child[children]) + sizeof(int) * children); } static inline mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) { mirror_map_t *mm; mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); mm->mm_children = children; mm->mm_resilvering = resilvering; mm->mm_root = root; mm->mm_preferred = (int *)((uintptr_t)mm + offsetof(mirror_map_t, mm_child[children])); return mm; } static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { vdev_mirror_map_free, zio_vsd_default_cksum_report }; static int vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) { uint64_t lastoffset; int load; /* All DVAs have equal weight at the root. */ if (mm->mm_root) return (INT_MAX); /* * We don't return INT_MAX if the device is resilvering i.e. * vdev_resilver_txg != 0 as when tested performance was slightly * worse overall when resilvering with compared to without. */ /* Standard load based on pending queue length. */ load = vdev_queue_length(vd); lastoffset = vdev_queue_lastoffset(vd); - if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) { + if (vd->vdev_nonrot) { /* Non-rotating media. */ if (lastoffset == zio_offset) return (load + non_rotating_inc); /* * Apply a seek penalty even for non-rotating devices as * sequential I/O'a can be aggregated into fewer operations * on the device, thus avoiding unnecessary per-command * overhead and boosting performance. */ return (load + non_rotating_seek_inc); } /* Rotating media I/O's which directly follow the last I/O. */ if (lastoffset == zio_offset) return (load + rotating_inc); /* * Apply half the seek increment to I/O's within seek offset * of the last I/O queued to this vdev as they should incure less * of a seek increment. */ if (ABS(lastoffset - zio_offset) < rotating_seek_offset) return (load + (rotating_seek_inc / 2)); /* Apply the full seek increment to all other I/O's. */ return (load + rotating_seek_inc); } static mirror_map_t * vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; dva_t dva_copy[SPA_DVAS_PER_BP]; c = BP_GET_NDVAS(zio->io_bp); /* * If we do not trust the pool config, some DVAs might be * invalid or point to vdevs that do not exist. We skip them. */ if (!spa_trust_config(spa)) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); int j = 0; for (int i = 0; i < c; i++) { if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) dva_copy[j++] = dva[i]; } if (j == 0) { zio->io_vsd = NULL; zio->io_error = ENXIO; return (NULL); } if (j < c) { dva = dva_copy; c = j; } } mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering * device because it might not have those blocks. * * We are resilvering iff: * 1) We are a replacing vdev (ie our name is "replacing-1" or * "spare-1" or something like that), and * 2) The pool is currently being resilvered. * * We cannot simply check vd->vdev_resilver_txg, because it's * not set in this path. * * Nor can we just check our vdev_ops; there are cases (such as * when a user types "zpool replace pool odev spare_dev" and * spare_dev is in the spare list, or when a spare device is * automatically used to replace a DEGRADED device) when * resilvering is complete but both the original vdev and the * spare vdev remain in the pool. That behavior is intentional. * It helps implement the policy that a spare should be * automatically removed from the pool after the user replaces * the device that originally failed. * * If a spa load is in progress, then spa_dsl_pool may be * uninitialized. But we shouldn't be resilvering during a spa * load anyway. */ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops) && spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; } } zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *physical_ashift = MAX(*physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { if (vdev_children_are_offline(vd)) vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; else vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_mirror_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static void vdev_mirror_child_done(zio_t *zio) { mirror_child_t *mc = zio->io_private; mc->mc_error = zio->io_error; mc->mc_tried = 1; mc->mc_skipped = 0; } static void vdev_mirror_scrub_done(zio_t *zio) { mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { zio_t *pio; zio_link_t *zl = NULL; mutex_enter(&zio->io_lock); while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); abd_copy(pio->io_abd, zio->io_abd, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } abd_free(zio->io_abd); mc->mc_error = zio->io_error; mc->mc_tried = 1; mc->mc_skipped = 0; } /* * Check the other, lower-index DVAs to see if they're on the same * vdev as the child we picked. If they are, use them since they * are likely to have been allocated from the primary metaslab in * use at the time, and hence are more likely to have locality with * single-copy data. */ static int vdev_mirror_dva_select(zio_t *zio, int p) { dva_t *dva = zio->io_bp->blk_dva; mirror_map_t *mm = zio->io_vsd; int preferred; int c; preferred = mm->mm_preferred[p]; for (p-- ; p >= 0; p--) { c = mm->mm_preferred[p]; if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) preferred = c; } return (preferred); } static int vdev_mirror_preferred_child_randomize(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; int p; if (mm->mm_root) { p = spa_get_random(mm->mm_preferred_cnt); return (vdev_mirror_dva_select(zio, p)); } /* * To ensure we don't always favour the first matching vdev, * which could lead to wear leveling issues on SSD's, we * use the I/O offset as a pseudo random seed into the vdevs * which have the lowest load. */ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; return (mm->mm_preferred[p]); } /* * Try to find a vdev whose DTL doesn't contain the block we want to read * prefering vdevs based on determined load. * * If we can't, try the read on any vdev we haven't already tried. */ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; uint64_t txg = zio->io_txg; int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; for (c = 0; c < mm->mm_children; c++) { mirror_child_t *mc; mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; if (!vdev_readable(mc->mc_vd)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; if (mc->mc_load < lowest_load) { lowest_load = mc->mc_load; mm->mm_preferred_cnt = 0; } mm->mm_preferred[mm->mm_preferred_cnt] = c; mm->mm_preferred_cnt++; } if (mm->mm_preferred_cnt == 1) { vdev_queue_register_lastoffset( mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); return (mm->mm_preferred[0]); } if (mm->mm_preferred_cnt > 1) { int c = vdev_mirror_preferred_child_randomize(zio); vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); return (c); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ for (c = 0; c < mm->mm_children; c++) { if (!mm->mm_child[c].mc_tried) { vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); return (c); } } /* * Every child failed. There's no place left to look. */ return (-1); } static void vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; mirror_child_t *mc; int c, children; mm = vdev_mirror_map_init(zio); if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); ASSERT(zio->io_type == ZIO_TYPE_READ); zio_execute(zio); return; } if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_bp != NULL && (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && mm->mm_children > 1) { /* * For scrubbing reads (if we can verify the * checksum here, as indicated by io_bp being * non-NULL) we need to allocate a read buffer for * each child and issue reads to all children. If * any child succeeds, it will copy its data into * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, abd_alloc_sametype(zio->io_abd, zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } zio_execute(zio); return; } /* * For normal reads just pick one child. */ c = vdev_mirror_child_select(zio); children = (c >= 0); } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); /* * Writes and frees go to all children. */ c = 0; children = mm->mm_children; } while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); c++; } zio_execute(zio); } static int vdev_mirror_worst_error(mirror_map_t *mm) { int error[2] = { 0, 0 }; for (int c = 0; c < mm->mm_children; c++) { mirror_child_t *mc = &mm->mm_child[c]; int s = mc->mc_speculative; error[s] = zio_worst_error(error[s], mc->mc_error); } return (error[0] ? error[0] : error[1]); } static void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; mirror_child_t *mc; int c; int good_copies = 0; int unexpected_errors = 0; if (mm == NULL) return; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_error) { if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ /* XXPOLICY */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. * * For ditto blocks (io_vd == NULL), require * all copies to be good. * * XXX -- for replacing vdevs, there's no great answer. * If the old device is really dead, we may not even * be able to access it -- so we only want to * require good writes to the new device. But if * the new device turns out to be flaky, we want * to be able to detach it -- which requires all * writes to the old device to have succeeded. */ if (good_copies == 0 || zio->io_vd == NULL) zio->io_error = vdev_mirror_worst_error(mm); } return; } else if (zio->io_type == ZIO_TYPE_FREE) { return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * If we don't have a good copy yet, keep trying other children. */ /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; } /* XXPOLICY */ if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); } if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. * Not only is it unnecessary, it could * actually be harmful: if the system lost * power while rewriting the only good copy, * there would be no good copies left! */ mc = &mm->mm_child[c]; if (mc->mc_error == 0) { if (mc->mc_tried) continue; /* * We didn't try this child. We need to * repair it if: * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - * 2. it's an indirect vdev (in which case * it could point to any other vdev, which * might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && mc->mc_vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = SET_ERROR(ESTALE); } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted == vd->vdev_children) { if (vdev_children_are_offline(vd)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, VDEV_AUX_CHILDREN_OFFLINE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); } } else if (degraded + faulted != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } } vdev_ops_t vdev_mirror_ops = { vdev_mirror_open, vdev_mirror_close, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, NULL, NULL, NULL, NULL, vdev_default_xlate, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { vdev_mirror_open, vdev_mirror_close, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, NULL, NULL, NULL, NULL, vdev_default_xlate, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { vdev_mirror_open, vdev_mirror_close, vdev_default_asize, vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, NULL, NULL, NULL, NULL, vdev_default_xlate, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 344935) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 344936) @@ -1,1055 +1,1055 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into six I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, scrub/resilver and trim. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum, and if the * aggregate maximum is equal to or greater than the sum of the per-queue * maximums, the per-queue minimum has no effect. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an I/O is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write I/Os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. * * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of I/Os active to each device. Ideally, this will be >= * the sum of each queue's max_active. It must be at least the sum of each * queue's min_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of I/Os active to each device. If the * sum of the queue's max_active is < zfs_vdev_max_active, then the * min_active comes into play. We will send min_active from each queue, * and then select from queues in the order defined by zio_priority_t. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. * * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. */ uint32_t zfs_vdev_sync_read_min_active = 10; uint32_t zfs_vdev_sync_read_max_active = 10; uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 1; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; uint32_t zfs_vdev_trim_min_active = 1; /* * TRIM max active is large in comparison to the other values due to the fact * that TRIM IOs are coalesced at the device layer. This value is set such * that a typical SSD can process the queued IOs in a single request. */ uint32_t zfs_vdev_trim_max_active = 64; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = 1 << 20; int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level. This percentage is * used in conjunction with zfs_vdev_async_max_active to determine how many * allocations a specific top-level vdev should handle. Once the queue depth * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 * then allocator will stop allocating blocks on that top-level device. * The default kernel setting is 1000% which will yield 100 allocations per * device. For userland testing, the default setting is 300% which equates * to 30 allocations per device. */ #ifdef _KERNEL int zfs_vdev_queue_depth_pct = 1000; #else int zfs_vdev_queue_depth_pct = 300; #endif /* * When performing allocations for a given metaslab, we want to make sure that * there are enough IOs to aggregate together to improve throughput. We want to * ensure that there are at least 128k worth of IOs that can be aggregated, and * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ int zfs_vdev_def_queue_depth = 32; #ifdef __FreeBSD__ #ifdef _KERNEL SYSCTL_DECL(_vfs_zfs_vdev); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_min_dirty_percent, "I", "Percentage of async write dirty data below which " "async_write_min_active is used."); static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_max_dirty_percent, "I", "Percentage of async write dirty data above which " "async_write_max_active is used."); SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device."); #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ &zfs_vdev_ ## name ## _min_active, 0, \ "Initial number of I/O requests of type " #name \ " active for each device"); #define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\ &zfs_vdev_ ## name ## _max_active, 0, \ "Maximum number of I/O requests of type " #name \ " active for each device"); ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); ZFS_VDEV_QUEUE_KNOB_MIN(async_read); ZFS_VDEV_QUEUE_KNOB_MAX(async_read); ZFS_VDEV_QUEUE_KNOB_MIN(async_write); ZFS_VDEV_QUEUE_KNOB_MAX(async_write); ZFS_VDEV_QUEUE_KNOB_MIN(scrub); ZFS_VDEV_QUEUE_KNOB_MAX(scrub); ZFS_VDEV_QUEUE_KNOB_MIN(trim); ZFS_VDEV_QUEUE_KNOB_MAX(trim); ZFS_VDEV_QUEUE_KNOB_MIN(removal); ZFS_VDEV_QUEUE_KNOB_MAX(removal); ZFS_VDEV_QUEUE_KNOB_MIN(initializing); ZFS_VDEV_QUEUE_KNOB_MAX(initializing); #undef ZFS_VDEV_QUEUE_KNOB SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, &zfs_vdev_aggregation_limit, 0, "I/O requests are aggregated up to this size"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit_non_rotating, CTLFLAG_RWTUN, &zfs_vdev_aggregation_limit_non_rotating, 0, "I/O requests are aggregated up to this size for non-rotating media"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_read_gap_limit, 0, "Acceptable gap between two reads being aggregated"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_write_gap_limit, 0, "Acceptable gap between two writes being aggregated"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, &zfs_vdev_queue_depth_pct, 0, "Queue depth percentage for each top-level"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, &zfs_vdev_def_queue_depth, 0, "Default queue depth for each allocator"); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_min_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val >= zfs_vdev_async_write_active_max_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_min_dirty_percent = val; return (0); } static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_max_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val <= zfs_vdev_async_write_active_min_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_max_dirty_percent = val; return (0); } #endif #endif int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int cmp = AVL_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); return (AVL_PCMP(z1, z2)); } static inline avl_tree_t * vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) { return (&vq->vq_class[p].vqc_queued_tree); } static inline avl_tree_t * vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) { if (t == ZIO_TYPE_READ) return (&vq->vq_read_offset_tree); else if (t == ZIO_TYPE_WRITE) return (&vq->vq_write_offset_tree); else return (NULL); } int vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_timestamp < z2->io_timestamp) return (-1); if (z1->io_timestamp > z2->io_timestamp) return (1); if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); /* * The synchronous i/o queues are dispatched in FIFO rather * than LBA order. This provides more consistent latency for * these i/os. */ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) compfn = vdev_queue_timestamp_compare; else compfn = vdev_queue_offset_compare; avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } vq->vq_lastoffset = 0; } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_add(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_queued++; if (spa->spa_iokstat != NULL) kstat_waitq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_remove(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); spa->spa_queue_stats[zio->io_priority].spa_queued--; if (spa->spa_iokstat != NULL) kstat_waitq_exit(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; avl_add(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_active++; if (spa->spa_iokstat != NULL) kstat_runq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; avl_remove(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); spa->spa_queue_stats[zio->io_priority].spa_active--; if (spa->spa_iokstat != NULL) { kstat_io_t *ksio = spa->spa_iokstat->ks_data; kstat_runq_exit(spa->spa_iokstat->ks_data); if (zio->io_type == ZIO_TYPE_READ) { ksio->reads++; ksio->nread += zio->io_size; } else if (zio->io_type == ZIO_TYPE_WRITE) { ksio->writes++; ksio->nwritten += zio->io_size; } } mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; zio_link_t *zl = NULL; while ((pio = zio_walk_parents(aio, &zl)) != NULL) { abd_copy_off(pio->io_abd, aio->io_abd, 0, pio->io_offset - aio->io_offset, pio->io_size); } } abd_free(aio->io_abd); } static int vdev_queue_class_min_active(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_min_active); case ZIO_PRIORITY_INITIALIZING: return (zfs_vdev_initializing_min_active); default: panic("invalid priority %u", p); return (0); } } static __noinline int vdev_queue_max_async_writes(spa_t *spa) { int writes; uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. */ if (spa_has_pending_synctask(spa)) { return (zfs_vdev_async_write_max_active); } if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); if (dirty > max_bytes) return (zfs_vdev_async_write_max_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } static int vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: return (zfs_vdev_initializing_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; zio_priority_t p; ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* find a queue that has not reached its minimum # outstanding i/os */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(p)) return (p); } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, p)) return (p); } /* No eligible queued i/os */ return (ZIO_PRIORITY_NUM_QUEUEABLE); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; zio_link_t *zl = NULL; uint64_t maxgap = 0; uint64_t size; uint64_t limit; int maxblocksize; boolean_t stretch; avl_tree_t *t; enum zio_flag flags; ASSERT(MUTEX_HELD(&vq->vq_lock)); maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); - if (vq->vq_vdev->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) + if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; limit = MAX(MIN(limit, maxblocksize), 0); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || zio->io_size >= limit) return (NULL); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) maxgap = zfs_vdev_read_gap_limit; /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation. */ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; t = vdev_queue_type_tree(vq, zio->io_type); while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && IO_GAP(dio, first) <= maxgap && dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. * The aggregation limit does not apply to optional i/os, so that * we can issue contiguous writes even if they are larger than the * aggregation limit. */ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && IO_SPAN(first, dio) <= maxblocksize && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile. */ stretch = B_FALSE; if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* * We are going to include an optional io in our aggregated * span, thus closing the write gap. Only mandatory i/os can * start aggregated spans, so make sure that the next i/o * after our span is mandatory. */ dio = AVL_NEXT(t, last); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd_alloc_for_io(size, B_TRUE), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; do { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3U(dio->io_type, ==, aio->io_type); if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); abd_zero_off(aio->io_abd, dio->io_offset - aio->io_offset, dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { abd_copy_off(aio->io_abd, dio->io_abd, dio->io_offset - aio->io_offset, 0, dio->io_size); } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); } while (dio != last); /* * We need to drop the vdev queue's lock to avoid a deadlock that we * could encounter since this I/O will complete immediately. */ mutex_exit(&vq->vq_lock); while ((dio = zio_walk_parents(aio, &zl)) != NULL) { zio_vdev_io_bypass(dio); zio_execute(dio); } mutex_enter(&vq->vq_lock); return (aio); } static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; zio_t search; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } /* * For LBA-ordered queues (async / scrub / initializing), issue the * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) zio = aio; else vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we need to * simply discard it. We need to drop the vdev queue's lock to avoid a * deadlock that we could encounter since this I/O will complete * immediately. */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset; return (zio); } zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherent their parent's priority, which might * not match the child's i/o type. Fix it up here. */ if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); zio->io_priority = ZIO_PRIORITY_TRIM; } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); return (NULL); } return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); vq->vq_io_complete_ts = gethrtime(); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio * code to issue IOs without adding them to the vdev queue. In this * case, the zio is already going to be issued as quickly as possible * and so it doesn't need any reprioitization to help. */ if (zio->io_priority == ZIO_PRIORITY_NOW) return; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (zio->io_type == ZIO_TYPE_READ) { if (priority != ZIO_PRIORITY_SYNC_READ && priority != ZIO_PRIORITY_ASYNC_READ && priority != ZIO_PRIORITY_SCRUB) priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (priority != ZIO_PRIORITY_SYNC_WRITE && priority != ZIO_PRIORITY_ASYNC_WRITE) priority = ZIO_PRIORITY_ASYNC_WRITE; } mutex_enter(&vq->vq_lock); /* * If the zio is in none of the queues we can simply change * the priority. If the zio is waiting to be submitted we must * remove it from the queue and re-insert it with the new priority. * Otherwise, the zio is currently active and we cannot change its * priority. */ tree = vdev_queue_class_tree(vq, zio->io_priority); if (avl_find(tree, zio, NULL) == zio) { avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } /* * As these three methods are only used for load calculations we're not concerned * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex * use here, instead we prefer to keep it lock free for performance. */ int vdev_queue_length(vdev_t *vd) { return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); } uint64_t vdev_queue_lastoffset(vdev_t *vd) { return (vd->vdev_queue.vq_lastoffset); } void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) { vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; }